# Reconstructed from a whitespace-collapsed cgit patch view:
#   commit  cfed90c1096a92c4c9e622dfe2d55d892595b2ff
#   author  Ido Ivri
#   date    Sun, 30 Oct 2016 19:47:28 +0200
#   subject initial commit of work done in DataHack
# The patch created three files (49 insertions in total):
#   parsers/INL_xml_parser.py (41 lines)
#   parsers/__init__.py       (2 lines)
#   parsers/basic_parser.py   (6 lines)
# Their contents follow in that order.

# ---------------------------------------------------------------------------
# parsers/INL_xml_parser.py
# ---------------------------------------------------------------------------
try:
    # C-accelerated implementation where it still exists (older Pythons).
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

KNOWN_FIELD_TAGS = ['100', '110', '151']

TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374', '046', '901', '001']


class INLXmlParser(object):
    """Copy whitelisted fields of a record into a new, flattened record.

    ``reader`` is any iterable of element-like objects exposing ``attrib``
    and ``text`` and being iterable over their own child elements (for
    example an ``xml.etree`` ``Element`` for one record).
    ``whitelist`` is the container of ``tag`` attribute values to keep;
    ``None`` falls back to ``TAG_WHITELIST``.
    """

    def __init__(self, reader, whitelist=TAG_WHITELIST):
        self.reader = reader
        # ``None`` would otherwise make the ``in`` test in clearxml() raise.
        self.whitelist = TAG_WHITELIST if whitelist is None else whitelist

    def clearxml(self):
        """Return an ``ElementTree`` holding only the whitelisted fields.

        The root of the returned tree is a ``record`` element whose children
        are all ``datafield`` elements:

        * a field tagged ``001`` is copied as-is (tag ``001``, same text);
        * every other whitelisted field contributes one ``datafield`` per
          child element, tagged ``<field tag>.<child code>`` and carrying
          the child's text.  Tag ``700`` is emitted as ``400``.

        Child elements without a ``code`` attribute are skipped, because no
        combined tag can be built for them.
        """
        new_record = ET.Element('record')
        for field in self.reader:
            field_tag = field.attrib.get('tag')
            if field_tag not in self.whitelist:
                continue

            if field_tag == '001':
                control = ET.SubElement(new_record, 'datafield', {'tag': '001'})
                control.text = field.text
                continue

            # tag 700 and 400 are the same
            base_tag = '400' if field_tag == '700' else field_tag
            for data in field:
                code = data.attrib.get('code')
                if code is None:
                    # Previously this crashed with TypeError (str + None).
                    continue
                flat = ET.SubElement(
                    new_record, 'datafield', {'tag': '{}.{}'.format(base_tag, code)}
                )
                flat.text = data.text

        return ET.ElementTree(new_record)


# ---------------------------------------------------------------------------
# parsers/__init__.py
# ---------------------------------------------------------------------------
# The package initialiser re-exports the parser with:
#     from .INL_xml_parser import INLXmlParser


# ---------------------------------------------------------------------------
# parsers/basic_parser.py
# ---------------------------------------------------------------------------
class BasicParser(object):
    """Base class for parsers; subclasses must override ``parse``."""

    def __init__(self):
        pass

    def parse(self, data):
        """Parse ``data``; always raises ``NotImplementedError`` here."""
        raise NotImplementedError("parse() method must be implemented class {}".format(type(self)))