summary | refs | log | tree | commit | diff
path: root/readers/xml_reader.py
diff options
context:
space:
mode:
Diffstat (limited to 'readers/xml_reader.py')
-rw-r--r-- readers/xml_reader.py 31
1 files changed, 31 insertions, 0 deletions
diff --git a/readers/xml_reader.py b/readers/xml_reader.py
new file mode 100644
index 0000000..8a819b0
--- /dev/null
+++ b/readers/xml_reader.py
@@ -0,0 +1,31 @@
+from __future__ import absolute_import
+import parsers
+
+try:
+ import xml.etree.cElementTree as ET
+except ImportError:
+ import xml.etree.ElementTree as ET
+
+
+def read_file(path, element_key):
+    """Stream the XML file at *path*; for each element whose tag contains
+    *element_key*, print a running count and the clearxml() root's attrib."""
+    record_counter = 0
+    # "start" events are requested so that the first event yields the root
+    context = iter(ET.iterparse(path, events=("start", "end")))
+    # next() rather than .next(): works on Python 2.6+ and on Python 3
+    _, root = next(context)
+    for event, element in context:
+        # substring test, so a namespaced tag like "{ns}record" also matches
+        if event != 'end' or element_key not in element.tag:
+            continue
+        record_counter += 1
+        cleaned_element = parsers.INLXmlParser(element).clearxml()
+        print("%s %s" % (record_counter, cleaned_element.getroot().attrib))
+        # drop the finished record and the emptied stubs root still references
+        element.clear()
+        root.clear()
+
+
+if __name__ == '__main__':
+ read_file(r"../../NLI-nnl10.xml", 'record')  # manual run: hard-coded relative input path, matches tags containing 'record'