diff options
author | roy lewin <roy.lewin@gmail.com> | 2016-09-22 01:43:21 +0300 |
---|---|---|
committer | roy lewin <roy.lewin@gmail.com> | 2016-09-22 01:43:21 +0300 |
commit | 652781137f3856fef98e3063766f9f3b1a984a2e (patch) | |
tree | 88517562c6193cbc48f1d0a044bd6dd97d3a8549 /readers | |
parent | 6beb87f2720b905c6b512a977ba2422fa183a832 (diff) |
Added xml_reader.py, and edited INL_xml_parser to work on a per-record basis
Diffstat (limited to 'readers')
-rw-r--r-- | readers/__init__.py | 0 | ||||
-rw-r--r-- | readers/xml_reader.py | 31 |
2 files changed, 31 insertions, 0 deletions
```diff
diff --git a/readers/__init__.py b/readers/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/readers/__init__.py
diff --git a/readers/xml_reader.py b/readers/xml_reader.py
new file mode 100644
index 0000000..8a819b0
--- /dev/null
+++ b/readers/xml_reader.py
@@ -0,0 +1,31 @@
+from __future__ import absolute_import
+import parsers
+
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+
+def read_file(path, element_key):
+    # get an iterable
+    record_counter = 0
+    context = ET.iterparse(path, events=("start", "end"))
+
+    # turn it into an iterator
+    context = iter(context)
+
+    # get the root element
+    event, root = context.next()
+
+    for event, element in context:
+        if 'end' in event:
+            if element_key in element.tag:
+                record_counter += 1
+                cleaned_element = parsers.INLXmlParser(element).clearxml()
+                print record_counter, cleaned_element.getroot().attrib
+                element.clear()
+
+
+if __name__ == '__main__':
+    read_file(r"../../NLI-nnl10.xml", 'record')
```