diff options
Diffstat (limited to 'readers')
-rw-r--r-- | readers/xml_reader.py | 71 |
1 files changed, 33 insertions, 38 deletions
diff --git a/readers/xml_reader.py b/readers/xml_reader.py index 2aaf8c6..af80e25 100644 --- a/readers/xml_reader.py +++ b/readers/xml_reader.py @@ -2,14 +2,13 @@ import json import csv import parsers, factories +from entities import Person try: import xml.etree.cElementTree as ET except ImportError: import xml.etree.ElementTree as ET -CSV_FIELDS = ["name", "biodata", "comments"] - def read_file(path, element_key): # get an iterable record_counter = 0 @@ -23,41 +22,37 @@ def read_file(path, element_key): #the factory inl_factory = factories.INLFactory() - with open('out.csv', 'w', encoding='utf8') as f: - writer = csv.DictWriter(f, CSV_FIELDS) - writer.writeheader() - f667 = open("667.txt", 'w', encoding="utf8") - f678 = open("678.txt", 'w', encoding="utf8") - for event, element in context: - if 'end' in event: - if element_key in element.tag: - #enter the processing here - record_counter += 1 - - #cleaned element is a tree - inl_parser = parsers.INLXmlParser(element) - cleaned_element = inl_parser.clearxml() - entity = inl_factory.get_entity(cleaned_element) - - - #test print the entity - if entity != None: - json_entity = entity.to_json() - print(json_entity) - writer.writerow({'name': entity.name, 'biodata': entity.bio_data, 'comments': json.dumps(entity.comments_list, ensure_ascii=False)}) - # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False) - # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False) - - # entity.print_entity() - - - #TODO analys and upload the entity - - - # import pdb; pdb.set_trace() - #print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', cleaned_element.getroot().text) - element.clear() - f667.close() - f678.close() + files = {} + for event, element in context: + if 'end' in event: + if element_key in element.tag: + #enter the processing here + record_counter += 1 + + #cleaned element is a tree + inl_parser = parsers.INLXmlParser(element) + cleaned_element = inl_parser.clearxml() + entity = inl_factory.get_entity(cleaned_element) + + #test print the entity + if entity != None: + if entity.TYPE not in files: + files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8') + json_entity = entity.to_json() + print(json_entity) + writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS) + writer.writerow(entity.to_csv_dict()) + # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False) + # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False) + + # entity.print_entity() + + + #TODO analys and upload the entity + + + # import pdb; pdb.set_trace() + #print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', cleaned_element.getroot().text) + element.clear() if __name__ == '__main__': read_file(r"C:/Users/Ilsar/Documents/datahack/NLI-nnl10.xml", 'record') |