From d646c9a42273e98c85602f5618598125007bbfaa Mon Sep 17 00:00:00 2001
From: Tzafrir Cohen
Date: Sun, 25 Sep 2016 20:28:16 +0300
Subject: WIP: commit all files that were changed

---
 readers/xml_reader.py | 122 +++++++++++++++++++++++++-------------------------
 1 file changed, 61 insertions(+), 61 deletions(-)

(limited to 'readers/xml_reader.py')

diff --git a/readers/xml_reader.py b/readers/xml_reader.py
index 5b2d1fd..710899d 100644
--- a/readers/xml_reader.py
+++ b/readers/xml_reader.py
@@ -1,61 +1,61 @@
-# from __future__ import absolute_import
-import json
-import csv
-import parsers, factories
-from entities import Person
-
-try:
-    import xml.etree.cElementTree as ET
-except ImportError:
-    import xml.etree.ElementTree as ET
-
-def read_file(path, element_key):
-    # get an iterable
-    record_counter = 0
-    context = ET.iterparse(path, events=("start", "end"))
-
-    # turn it into an iterator
-    context = iter(context)
-
-    # get the root element
-    event, root = context.__next__()
-
-    # the factory
-    inl_factory = factories.INLFactory()
-    files = {}
-    for event, element in context:
-        if 'end' in event:
-            if element_key in element.tag:
-                # enter the processing here
-                record_counter += 1
-
-                #cleaned element is a tree
-                inl_parser = parsers.INLXmlParser(element)
-                cleaned_element = inl_parser.clearxml()
-                entity = inl_factory.get_entity(cleaned_element)
-
-                # test print the entity
-                if entity != None:
-                    if entity.TYPE not in files:
-                        files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8')
-                    json_entity = entity.to_json()
-                    print(json_entity)
-                    writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS)
-                    writer.writerow(entity.to_csv_dict())
-                    # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False)
-                    # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False)
-
-                    # entity.print_entity()
-
-                # TODO analys and upload the entity
-
-
-                # import pdb; pdb.set_trace()
-                print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@',
-                      cleaned_element.getroot().text)
-                element.clear()
-    print(record_counter)
-
-
-if __name__ == '__main__':
-    read_file(r"../../NLI-nnl10.xml", 'record')
+# from __future__ import absolute_import
+import json
+import csv
+import parsers, factories
+from entities import Person
+
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+def read_file(path, element_key):
+    # get an iterable
+    record_counter = 0
+    context = ET.iterparse(path, events=("start", "end"))
+
+    # turn it into an iterator
+    context = iter(context)
+
+    # get the root element
+    event, root = context.__next__()
+
+    # the factory
+    inl_factory = factories.INLFactory()
+    files = {}
+    for event, element in context:
+        if 'end' in event:
+            if element_key in element.tag:
+                # enter the processing here
+                record_counter += 1
+
+                #cleaned element is a tree
+                inl_parser = parsers.INLXmlParser(element)
+                cleaned_element = inl_parser.clearxml()
+                entity = inl_factory.get_entity(cleaned_element)
+
+                # test print the entity
+                if entity != None:
+                    if entity.TYPE not in files:
+                        files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8')
+                    json_entity = entity.to_json()
+                    print(json_entity)
+                    writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS)
+                    writer.writerow(entity.to_csv_dict())
+                    # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False)
+                    # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False)
+
+                    # entity.print_entity()
+
+                # TODO analys and upload the entity
+
+
+                # import pdb; pdb.set_trace()
+                print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@',
+                      cleaned_element.getroot().text)
+                element.clear()
+    print(record_counter)
+
+
+if __name__ == '__main__':
+    read_file(r"../../NLI-nnl10.xml", 'record')
--
cgit v1.2.3
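The script touched by this patch follows the standard incremental-parsing pattern for large XML files: consume (event, element) pairs from xml.etree.ElementTree.iterparse, handle each record on its "end" event, and clear processed elements so memory stays flat. Below is a minimal, self-contained sketch of that pattern, assuming a hypothetical <record> tag with id/type attributes; it stands in for parsers.INLXmlParser and factories.INLFactory, which are not part of this patch.

    # Sketch of the iterparse + per-type CSV pattern used in readers/xml_reader.py.
    # The tag and attribute names below are assumptions for illustration only.
    import csv
    import xml.etree.ElementTree as ET

    def stream_records(path, element_key="record"):
        context = ET.iterparse(path, events=("start", "end"))
        _, root = next(context)   # first event is the start of the root element
        files, writers = {}, {}   # one CSV file/writer per record type, opened lazily
        count = 0
        for event, element in context:
            if event == "end" and element_key in element.tag:
                count += 1
                rec_type = element.get("type", "unknown")   # hypothetical attribute
                row = {"id": element.get("id"), "text": (element.text or "").strip()}
                if rec_type not in writers:
                    f = open("out/{}.csv".format(rec_type), "w", encoding="utf8", newline="")
                    files[rec_type] = f
                    writers[rec_type] = csv.DictWriter(f, fieldnames=["id", "text"])
                    writers[rec_type].writeheader()
                writers[rec_type].writerow(row)
                element.clear()   # drop the finished record's children
                root.clear()      # detach processed records from the root to bound memory
        for f in files.values():
            f.close()
        return count

    if __name__ == "__main__":
        print(stream_records("records.xml"))   # hypothetical input file

Opening writers lazily keyed by record type, as the patch does with entity.TYPE, avoids having to know the full set of types up front; the sketch additionally closes the files at the end, which the WIP code does not yet do.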