diff options
Diffstat (limited to 'readers/xml_reader.py')
-rw-r--r-- | readers/xml_reader.py | 122 |
1 file changed, 61 insertions, 61 deletions
diff --git a/readers/xml_reader.py b/readers/xml_reader.py index 5b2d1fd..710899d 100644 --- a/readers/xml_reader.py +++ b/readers/xml_reader.py @@ -1,61 +1,61 @@ -# from __future__ import absolute_import -import json -import csv -import parsers, factories -from entities import Person - -try: - import xml.etree.cElementTree as ET -except ImportError: - import xml.etree.ElementTree as ET - -def read_file(path, element_key): - # get an iterable - record_counter = 0 - context = ET.iterparse(path, events=("start", "end")) - - # turn it into an iterator - context = iter(context) - - # get the root element - event, root = context.__next__() - - # the factory - inl_factory = factories.INLFactory() - files = {} - for event, element in context: - if 'end' in event: - if element_key in element.tag: - # enter the processing here - record_counter += 1 - - #cleaned element is a tree - inl_parser = parsers.INLXmlParser(element) - cleaned_element = inl_parser.clearxml() - entity = inl_factory.get_entity(cleaned_element) - - # test print the entity - if entity != None: - if entity.TYPE not in files: - files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8') - json_entity = entity.to_json() - print(json_entity) - writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS) - writer.writerow(entity.to_csv_dict()) - # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False) - # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False) - - # entity.print_entity() - - # TODO analys and upload the entity - - - # import pdb; pdb.set_trace() - print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', - cleaned_element.getroot().text) - element.clear() - print(record_counter) - - -if __name__ == '__main__': - read_file(r"../../NLI-nnl10.xml", 'record') +# from __future__ import absolute_import
+import json
+import csv
+import parsers, factories
+from entities import Person
+
+try:
+ import xml.etree.cElementTree as ET
+except ImportError:
+ import xml.etree.ElementTree as ET
+
def read_file(path, element_key):
    """Stream-parse the XML file at *path* and export matching records to CSV.

    Every element whose tag contains *element_key* is cleaned by
    ``parsers.INLXmlParser``, converted to an entity by ``factories.INLFactory``,
    and appended as one row to ``../out/<TYPE>.csv`` (one file per entity TYPE).

    Args:
        path: filesystem path of the (potentially very large) XML file.
        element_key: substring identifying record elements, e.g. ``'record'``.
    """
    record_counter = 0
    # iterparse streams the document so memory stays bounded: each element is
    # processed on its "end" event and cleared afterwards.
    context = iter(ET.iterparse(path, events=("start", "end")))

    # Grab the root element first; it must stay alive while children are cleared.
    # Use the built-in next() rather than calling __next__() directly.
    event, root = next(context)

    inl_factory = factories.INLFactory()
    files = {}  # entity TYPE -> open CSV file handle
    try:
        for event, element in context:
            if 'end' in event and element_key in element.tag:
                record_counter += 1

                # clearxml() returns a cleaned ElementTree for this record
                inl_parser = parsers.INLXmlParser(element)
                cleaned_element = inl_parser.clearxml()
                entity = inl_factory.get_entity(cleaned_element)

                # Identity comparison with None (PEP 8), not != None.
                if entity is not None:
                    if entity.TYPE not in files:
                        # newline='' is required by the csv module; without it
                        # rows are separated by blank lines on Windows.
                        files[entity.TYPE] = open(
                            "../out/{}.csv".format(entity.TYPE),
                            'w+', encoding='utf8', newline='')
                    json_entity = entity.to_json()
                    print(json_entity)
                    writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS)
                    writer.writerow(entity.to_csv_dict())

                # TODO analyse and upload the entity

                print(record_counter, cleaned_element.getroot().tag, '@@@',
                      cleaned_element.getroot().attrib, '@@@',
                      cleaned_element.getroot().text)
                # Free the processed element so memory usage stays flat.
                element.clear()
    finally:
        # Close every CSV handle even if parsing or processing fails
        # (the original leaked all of them).
        for handle in files.values():
            handle.close()
    print(record_counter)
+
+
# Script entry point: parse the national-library XML dump, emitting per-type
# CSV files as a side effect of read_file().
if __name__ == '__main__':
    read_file(r"../../NLI-nnl10.xml", 'record')
|