summary | refs | log | tree | commit | diff
path: root/parsers
diff options
context:
space:
mode:
author: Ido Ivri <ido@zencity.io> 2016-10-30 19:47:28 +0200
committer: Ido Ivri <ido@zencity.io> 2016-10-30 19:47:28 +0200
commit: cfed90c1096a92c4c9e622dfe2d55d892595b2ff (patch)
tree: df8a996130a7ad199ede2312412d798147576d84 /parsers
initial commit of work done in DataHack
Diffstat (limited to 'parsers')
-rwxr-xr-x parsers/INL_xml_parser.py 41
-rwxr-xr-x parsers/__init__.py 2
-rwxr-xr-x parsers/basic_parser.py 6
3 files changed, 49 insertions, 0 deletions
diff --git a/parsers/INL_xml_parser.py b/parsers/INL_xml_parser.py
new file mode 100755
index 0000000..f90e778
--- /dev/null
+++ b/parsers/INL_xml_parser.py
@@ -0,0 +1,41 @@
try:
    import xml.etree.cElementTree as ET
except ImportError:
    # Fall back to the plain module when the C-accelerated alias is missing.
    import xml.etree.ElementTree as ET

KNOWN_FIELD_TAGS = ['100', '110', '151']

TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374', '046', '901', '001']


class INLXmlParser(object):
    """Flatten the whitelisted fields of one record into a new XML record.

    ``reader`` is any iterable of element-like objects exposing ``attrib``,
    ``text`` and iteration over their sub-elements (for example an
    ``xml.etree`` ``Element`` whose children are the fields).
    ``whitelist`` is the container of ``tag`` attribute values to keep;
    passing ``None`` selects ``TAG_WHITELIST``.
    """

    def __init__(self, reader, whitelist=TAG_WHITELIST):
        self.reader = reader
        # An explicit None used to crash later on the membership test;
        # treat it as "use the default whitelist" instead.
        self.whitelist = TAG_WHITELIST if whitelist is None else whitelist

    def clearxml(self):
        """Build and return an ``ElementTree`` holding only whitelisted data.

        The returned tree has a ``record`` root with one ``datafield``
        child per copied value:

        * a field tagged ``001`` is copied as-is (``tag="001"``, same text);
        * every other whitelisted field contributes one ``datafield`` per
          sub-element, tagged ``"<field tag>.<sub-element code>"``;
        * tag ``700`` is emitted under ``400`` because the two are treated
          as the same.

        Fields whose tag is missing or not whitelisted are skipped, and so
        are sub-elements that carry no ``code`` attribute (they cannot be
        addressed as ``tag.code``).
        """
        new_record = ET.Element('record')

        for field in self.reader:
            field_tag = field.attrib.get('tag')
            if field_tag not in self.whitelist:
                continue

            if field_tag == '001':
                control = ET.SubElement(new_record, 'datafield', {'tag': '001'})
                control.text = field.text
                continue

            # tag 700 and 400 are the same
            base_tag = '400' if field_tag == '700' else field_tag

            for data in field:
                code = data.attrib.get('code')
                if code is None:
                    # No code means no "tag.code" name can be built; the
                    # original concatenation raised TypeError here.
                    continue
                flat = ET.SubElement(
                    new_record, 'datafield', {'tag': base_tag + '.' + code})
                flat.text = data.text

        return ET.ElementTree(new_record)
diff --git a/parsers/__init__.py b/parsers/__init__.py
new file mode 100755
index 0000000..d32c917
--- /dev/null
+++ b/parsers/__init__.py
@@ -0,0 +1,2 @@
+
+from .INL_xml_parser import INLXmlParser
\ No newline at end of file
diff --git a/parsers/basic_parser.py b/parsers/basic_parser.py
new file mode 100755
index 0000000..dae19cb
--- /dev/null
+++ b/parsers/basic_parser.py
@@ -0,0 +1,6 @@
class BasicParser(object):
    """Base class for parsers; subclasses must override ``parse``."""

    def __init__(self):
        pass

    def parse(self, data):
        """Parse ``data``; always raises here because there is no default.

        Raises:
            NotImplementedError: naming the concrete class that failed to
                override this method.
        """
        # Use the bare class name: formatting type(self) directly embeds
        # the "<class '...'>" repr in the message.
        raise NotImplementedError(
            "parse() method must be implemented by class {}".format(
                type(self).__name__))