diff options
Diffstat (limited to 'readers/xml_reader.py')
-rw-r--r-- | readers/xml_reader.py | 122 |
1 files changed, 61 insertions, 61 deletions
diff --git a/readers/xml_reader.py b/readers/xml_reader.py index 710899d..5b2d1fd 100644 --- a/readers/xml_reader.py +++ b/readers/xml_reader.py @@ -1,61 +1,61 @@ -# from __future__ import absolute_import
-import json
-import csv
-import parsers, factories
-from entities import Person
-
-try:
- import xml.etree.cElementTree as ET
-except ImportError:
- import xml.etree.ElementTree as ET
-
def read_file(path, element_key):
    """Stream-parse the XML file at *path*, process every element whose tag
    contains *element_key*, and append one CSV row per extracted entity to
    ``../out/<TYPE>.csv`` (one file per entity TYPE, header written once).

    Returns the number of matching records processed.
    """
    record_counter = 0
    # iterparse streams the document instead of loading it all into memory
    context = iter(ET.iterparse(path, events=("start", "end")))

    # advance past the opening event to obtain the root element
    event, root = next(context)

    inl_factory = factories.INLFactory()
    files = {}  # entity TYPE -> (file handle, csv.DictWriter)
    try:
        for event, element in context:
            if 'end' in event and element_key in element.tag:
                record_counter += 1

                # cleaned_element is an ElementTree wrapping the stripped record
                inl_parser = parsers.INLXmlParser(element)
                cleaned_element = inl_parser.clearxml()
                entity = inl_factory.get_entity(cleaned_element)

                if entity is not None:
                    if entity.TYPE not in files:
                        # open one output file per entity type; newline='' is
                        # required by the csv module, and the writer (created
                        # once, not per row) emits the header immediately
                        handle = open("../out/{}.csv".format(entity.TYPE),
                                      'w', encoding='utf8', newline='')
                        writer = csv.DictWriter(handle, entity.CSV_FIELDS)
                        writer.writeheader()
                        files[entity.TYPE] = (handle, writer)
                    # test print the entity
                    print(entity.to_json())
                    files[entity.TYPE][1].writerow(entity.to_csv_dict())

                # TODO: analyse and upload the entity
                print(record_counter, cleaned_element.getroot().tag, '@@@',
                      cleaned_element.getroot().attrib, '@@@',
                      cleaned_element.getroot().text)
                # release memory held by the fully-processed element
                element.clear()
    finally:
        # close every per-type output file even if parsing raises
        for handle, _writer in files.values():
            handle.close()
    print(record_counter)
    return record_counter
-
-
if __name__ == '__main__':
    # Script entry point: extract 'record' elements from the NLI dump.
    source_path = r"../../NLI-nnl10.xml"
    read_file(source_path, 'record')
# from __future__ import absolute_import
import json
import csv
import parsers, factories
from entities import Person

# cElementTree is faster on old interpreters; fall back to the pure-python tree
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET


def read_file(path, element_key):
    """Stream-parse the XML file at *path*, process every element whose tag
    contains *element_key*, and append one CSV row per extracted entity to
    ``../out/<TYPE>.csv`` (one file per entity TYPE, header written once).

    Returns the number of matching records processed.
    """
    record_counter = 0
    # iterparse streams the document instead of loading it all into memory
    context = iter(ET.iterparse(path, events=("start", "end")))

    # advance past the opening event to obtain the root element
    event, root = next(context)

    inl_factory = factories.INLFactory()
    files = {}  # entity TYPE -> (file handle, csv.DictWriter)
    try:
        for event, element in context:
            if 'end' in event and element_key in element.tag:
                record_counter += 1

                # cleaned_element is an ElementTree wrapping the stripped record
                inl_parser = parsers.INLXmlParser(element)
                cleaned_element = inl_parser.clearxml()
                entity = inl_factory.get_entity(cleaned_element)

                if entity is not None:
                    if entity.TYPE not in files:
                        # open one output file per entity type; newline='' is
                        # required by the csv module, and the writer (created
                        # once, not per row) emits the header immediately
                        handle = open("../out/{}.csv".format(entity.TYPE),
                                      'w', encoding='utf8', newline='')
                        writer = csv.DictWriter(handle, entity.CSV_FIELDS)
                        writer.writeheader()
                        files[entity.TYPE] = (handle, writer)
                    # test print the entity
                    print(entity.to_json())
                    files[entity.TYPE][1].writerow(entity.to_csv_dict())

                # TODO: analyse and upload the entity
                print(record_counter, cleaned_element.getroot().tag, '@@@',
                      cleaned_element.getroot().attrib, '@@@',
                      cleaned_element.getroot().text)
                # release memory held by the fully-processed element
                element.clear()
    finally:
        # close every per-type output file even if parsing raises
        for handle, _writer in files.values():
            handle.close()
    print(record_counter)
    return record_counter


if __name__ == '__main__':
    read_file(r"../../NLI-nnl10.xml", 'record')