From 652781137f3856fef98e3063766f9f3b1a984a2e Mon Sep 17 00:00:00 2001 From: roy lewin Date: Thu, 22 Sep 2016 01:43:21 +0300 Subject: Added xml_reader.py, and edited INL_xml_parser to work on a per-record basis --- .idea/misc.xml | 3 + .idea/workspace.xml | 213 ++++++++++++++++++++++++++++++++++++++-------- __init__.py | 0 parsers/INL_xml_parser.py | 47 +++++----- parsers/__init__.py | 1 + readers/__init__.py | 0 readers/xml_reader.py | 31 +++++++ 7 files changed, 236 insertions(+), 59 deletions(-) create mode 100644 __init__.py create mode 100644 readers/__init__.py create mode 100644 readers/xml_reader.py diff --git a/.idea/misc.xml b/.idea/misc.xml index de9bbc8..f38dd77 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,7 @@ + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 1a3da85..73e2873 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -1,7 +1,15 @@ - + + + + + + + + + @@ -12,6 +20,9 @@ + + + @@ -31,11 +42,11 @@ - + - - + + @@ -43,6 +54,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -80,6 +133,10 @@ @@ -128,6 +185,30 @@ - - - - - - - - @@ -545,13 +646,55 @@ - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parsers/INL_xml_parser.py b/parsers/INL_xml_parser.py index 2ea9a9b..512d46e 100644 --- a/parsers/INL_xml_parser.py +++ b/parsers/INL_xml_parser.py @@ -1,34 +1,33 @@ -import xml.etree.cElementTree as ET +try: + import xml.etree.cElementTree as ET +except ImportError: + import xml.etree.ElementTree as ET + +KNOWN_FIELD_TAGS = ['100', '110', '151'] class INLXmlParser: - def __init__(self, lst, path): - self.whitelist = lst - self.xmlpath = path + def __init__(self, reader, whitelist=None): + self.reader = reader + self.whitelist = whitelist or KNOWN_FIELD_TAGS def clearxml(self): - xmltree = ET.parse(self.xmlpath) - # root == list of records - root = xmltree.getroot() - - # create new data newTreeRoot = ET.Element('data') - # scan the datafields in the records and copy to the new one only the tags in the whitelist - for record in root: - # create new record - newRecord = ET.SubElement(newTreeRoot, 'record') - for field in record: - fieldtag = field.attrib.get('tag') - if fieldtag in self.whitelist: - newFieldTag = fieldtag - # tag 700 and 400 are the same - if newFieldTag == '700': - newFieldTag = '400' - newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag}) - for data in field: - subData = ET.SubElement(newTag, data.tag, data.attrib) - subData.text = data.text + # # scan the datafields in the records and copy to the new one only the tags in the whitelist + # for record in root: # create new record + newRecord = ET.SubElement(newTreeRoot, 'record') + for field in self.reader: + fieldtag = field.attrib.get('tag') + if fieldtag in self.whitelist: + newFieldTag = fieldtag + # tag 700 and 400 are the same + if newFieldTag == '700': + newFieldTag = '400' + newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag}) + for data in field: + subData = ET.SubElement(newTag, data.tag, data.attrib) + subData.text = data.text newTree = ET.ElementTree(newTreeRoot) return newTree diff --git a/parsers/__init__.py b/parsers/__init__.py index e69de29..e3a246d 100644 --- a/parsers/__init__.py +++ b/parsers/__init__.py @@ -0,0 +1 @@ +from INL_xml_parser import INLXmlParser \ No newline at end of file diff --git a/readers/__init__.py b/readers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/readers/xml_reader.py b/readers/xml_reader.py new file mode 100644 index 0000000..8a819b0 --- /dev/null +++ b/readers/xml_reader.py @@ -0,0 +1,31 @@ +from __future__ import absolute_import +import parsers + +try: + import xml.etree.cElementTree as ET +except ImportError: + import xml.etree.ElementTree as ET + + +def read_file(path, element_key): + # get an iterable + record_counter = 0 + context = ET.iterparse(path, events=("start", "end")) + + # turn it into an iterator + context = iter(context) + + # get the root element + event, root = context.next() + + for event, element in context: + if 'end' in event: + if element_key in element.tag: + record_counter += 1 + cleaned_element = parsers.INLXmlParser(element).clearxml() + print record_counter, cleaned_element.getroot().attrib + element.clear() + + +if __name__ == '__main__': + read_file(r"../../NLI-nnl10.xml", 'record') -- cgit v1.2.3