From 3615e3968baa5a464a2725e280a7e0f89e3428cf Mon Sep 17 00:00:00 2001
From: gilad_ilsar
Date: Thu, 22 Sep 2016 16:34:57 +0300
Subject: parser into csv

---
 readers/xml_reader.py | 67 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 39 insertions(+), 28 deletions(-)

diff --git a/readers/xml_reader.py b/readers/xml_reader.py
index ec2c696..2aaf8c6 100644
--- a/readers/xml_reader.py
+++ b/readers/xml_reader.py
@@ -1,4 +1,6 @@
 # from __future__ import absolute_import
+import json
+import csv
 import parsers, factories
 
 try:
@@ -6,6 +8,7 @@ try:
 except ImportError:
     import xml.etree.ElementTree as ET
 
+CSV_FIELDS = ["name", "biodata", "comments"]
 
 def read_file(path, element_key):
     # get an iterable
@@ -20,33 +23,41 @@ def read_file(path, element_key):
 
     #the factory
     inl_factory = factories.INLFactory()
-
-    for event, element in context:
-        if 'end' in event:
-            if element_key in element.tag:
-                #enter the processing here
-                record_counter += 1
-
-                for field in element:
-                    print(field.tag, field.attrib)
-
-                #cleaned element is a tree
-                inl_parser = parsers.INLXmlParser(element)
-                cleaned_element = inl_parser.clearxml()
-                entity = inl_factory.get_entity(cleaned_element)
-
-                #test print the entity
-                if entity != None:
-                    entity.print_entity()
-
-
-                #TODO analys and upload the entity
-
-
-                # import pdb; pdb.set_trace()
-                print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', cleaned_element.getroot().text)
-                element.clear()
-
-
+    with open('out.csv', 'w', encoding='utf8') as f:
+        writer = csv.DictWriter(f, CSV_FIELDS)
+        writer.writeheader()
+        f667 = open("667.txt", 'w', encoding="utf8")
+        f678 = open("678.txt", 'w', encoding="utf8")
+        for event, element in context:
+            if 'end' in event:
+                if element_key in element.tag:
+                    #enter the processing here
+                    record_counter += 1
+
+                    #cleaned element is a tree
+                    inl_parser = parsers.INLXmlParser(element)
+                    cleaned_element = inl_parser.clearxml()
+                    entity = inl_factory.get_entity(cleaned_element)
+
+
+                    #test print the entity
+                    if entity != None:
+                        json_entity = entity.to_json()
+                        print(json_entity)
+                        writer.writerow({'name': entity.name, 'biodata': entity.bio_data, 'comments': json.dumps(entity.comments_list, ensure_ascii=False)})
+                        # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False)
+                        # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False)
+
+                        # entity.print_entity()
+
+
+                    #TODO analys and upload the entity
+
+
+                    # import pdb; pdb.set_trace()
+                    #print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', cleaned_element.getroot().text)
+                    element.clear()
+    f667.close()
+    f678.close()
 
 if __name__ == '__main__':
     read_file(r"C:/Users/Ilsar/Documents/datahack/NLI-nnl10.xml", 'record')
--
cgit v1.2.3
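
For reference, a minimal standalone sketch of the CSV-writing pattern this commit introduces: a csv.DictWriter over the CSV_FIELDS columns, with the nested comments list serialized into one cell via json.dumps(..., ensure_ascii=False). The entities list and its values below are hypothetical placeholders; in the patch they come from the INLFactory entities (entity.name, entity.bio_data, entity.comments_list).

    import csv
    import json

    CSV_FIELDS = ["name", "biodata", "comments"]

    # Hypothetical stand-ins for the parsed INL entities.
    entities = [
        {"name": "Example Person", "biodata": "1900-1980",
         "comments": ["first note", "second note"]},
    ]

    # newline='' is the csv-module recommendation for file objects; the patch
    # itself omits it, which can produce blank rows between records on Windows.
    with open("out.csv", "w", encoding="utf8", newline="") as f:
        writer = csv.DictWriter(f, CSV_FIELDS)
        writer.writeheader()
        for entity in entities:
            # A list cannot be stored in a CSV cell directly, so it is embedded
            # as a JSON string; ensure_ascii=False keeps non-ASCII (e.g. Hebrew)
            # text readable instead of \uXXXX escapes.
            writer.writerow({
                "name": entity["name"],
                "biodata": entity["biodata"],
                "comments": json.dumps(entity["comments"], ensure_ascii=False),
            })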