import csv
import json
# xml.etree.cElementTree is deprecated in Python 3 (removed in 3.9);
# plain ElementTree already uses the C accelerator when available.
import xml.etree.ElementTree as ET

import factories
import parsers

CSV_FIELDS = ["name", "biodata", "comments"]


def read_file(path, element_key):
    record_counter = 0

    # Stream the XML instead of loading the whole file into memory.
    context = iter(ET.iterparse(path, events=("start", "end")))

    # Grab the root element so processed records can be freed as we go.
    event, root = next(context)

    # The factory that turns cleaned records into entity objects.
    inl_factory = factories.INLFactory()

    # newline='' keeps the csv module from writing blank rows on Windows.
    with open('out.csv', 'w', encoding='utf8', newline='') as f:
        writer = csv.DictWriter(f, CSV_FIELDS)
        writer.writeheader()
        # Debug sinks for the commented-out dumps below.
        f667 = open("667.txt", 'w', encoding="utf8")
        f678 = open("678.txt", 'w', encoding="utf8")
        for event, element in context:
            if event == 'end' and element_key in element.tag:
                record_counter += 1

                # clearxml() returns a cleaned ElementTree for this record.
                inl_parser = parsers.INLXmlParser(element)
                cleaned_element = inl_parser.clearxml()
                entity = inl_factory.get_entity(cleaned_element)

                if entity is not None:
                    json_entity = entity.to_json()
                    print(json_entity)
                    writer.writerow({
                        'name': entity.name,
                        'biodata': entity.bio_data,
                        'comments': json.dumps(entity.comments_list,
                                               ensure_ascii=False),
                    })
                    # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False)
                    # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False)

                # TODO: analyze and upload the entity.

                # Drop the processed record so memory stays bounded.
                element.clear()
        f667.close()
        f678.close()
    print(record_counter, "records processed")


if __name__ == '__main__':
    read_file(r"C:/Users/Ilsar/Documents/datahack/NLI-nnl10.xml", 'record')
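
# ---------------------------------------------------------------------------
# The script above assumes two local project modules, `parsers` and
# `factories`, plus entity objects exposing name, bio_data, comments_list,
# and to_json(). The stubs below are a minimal sketch of those assumed
# interfaces, inferred only from the call sites above; they are hypothetical
# stand-ins that let read_file() run end to end without the real modules,
# not the project's actual implementations.
# ---------------------------------------------------------------------------
import json
import xml.etree.ElementTree as ET


class INLXmlParser:
    """Assumed interface: wraps a raw record element, returns a cleaned tree."""

    def __init__(self, element):
        self.element = element

    def clearxml(self):
        # The real implementation presumably strips namespaces and noise;
        # this stub just wraps the element in a fresh ElementTree.
        return ET.ElementTree(self.element)


class Entity:
    """Assumed shape of the objects INLFactory produces."""

    def __init__(self, name, bio_data, comments_list):
        self.name = name
        self.bio_data = bio_data
        self.comments_list = comments_list

    def to_json(self):
        return json.dumps(self.__dict__, ensure_ascii=False)


class INLFactory:
    """Assumed interface: builds an Entity from a cleaned element tree."""

    def get_entity(self, cleaned_element):
        root = cleaned_element.getroot()
        # Real logic would inspect the record's fields; this placeholder
        # returns a trivially populated entity so the pipeline can run.
        return Entity(name=root.tag, bio_data=root.attrib, comments_list=[])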