From a59d8c977eac3eb5c71870815730e4c9bf35bad2 Mon Sep 17 00:00:00 2001 From: gilad_ilsar Date: Thu, 22 Sep 2016 16:55:33 +0300 Subject: parser into csv --- .gitignore | 2 + .idea/workspace.xml | 148 +++++++++++++++++++++++++++++++++----------------- entities/location.py | 10 ++++ entities/person.py | 8 +++ readers/xml_reader.py | 71 +++++++++++------------- 5 files changed, 151 insertions(+), 88 deletions(-) diff --git a/.gitignore b/.gitignore index 27d1f89..7ebdd82 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,5 @@ ENV/ # Rope project settings .ropeproject + +.out/* \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml index f527370..7a9b802 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,9 +2,10 @@ + - - + + @@ -19,7 +20,7 @@ - + - + - - + + @@ -55,7 +56,7 @@ - + @@ -65,7 +66,7 @@ - + @@ -78,9 +79,19 @@ - - - + + + + + + + + + + + + + @@ -95,12 +106,15 @@ - + - - - + + + + + + @@ -149,19 +163,19 @@ @@ -602,7 +616,14 @@ \ No newline at end of file diff --git a/entities/location.py b/entities/location.py index 064b193..a43eb8d 100644 --- a/entities/location.py +++ b/entities/location.py @@ -1,3 +1,5 @@ +import json + from entities.basic_entity import BasicEntity @@ -8,8 +10,16 @@ class Location(BasicEntity): self.name_in_langs = name_in_langs self.comments_list = comments_list + CSV_FIELDS = ["name", "comments"] + TYPE = "LOCATION" + + def print_entity(self): print("Name = " + self.name) print("Name in langs = " + str(self.name_in_langs)) print("Types = " + str(self.types_of_place)) print("Comments = " + str(self.comments_list)) + + def to_csv_dict(self): + return {'name': self.name, + 'comments': json.dumps(self.comments_list, ensure_ascii=False)} diff --git a/entities/person.py b/entities/person.py index c6db584..b315aac 100644 --- a/entities/person.py +++ b/entities/person.py @@ -1,3 +1,5 @@ +import json + from entities.basic_entity import BasicEntity @@ -57,6 +59,8 @@ class Person(BasicEntity): self.comments_list = comments_list self.profession = profession + CSV_FIELDS = ["name", "biodata", "comments"] + TYPE = 'PERSON' def print_entity(self): print("Name = " + self.name) @@ -66,3 +70,7 @@ class Person(BasicEntity): print("Bio Data = " + str(self.bio_data)) print("Comments = " + str(self.comments_list)) print("Profession = " + str(self.profession)) + + def to_csv_dict(self): + return {'name': self.name, 'biodata': self.bio_data, + 'comments': json.dumps(self.comments_list, ensure_ascii=False)} diff --git a/readers/xml_reader.py b/readers/xml_reader.py index 2aaf8c6..af80e25 100644 --- a/readers/xml_reader.py +++ b/readers/xml_reader.py @@ -2,14 +2,13 @@ import json import csv import parsers, factories +from entities import Person try: import xml.etree.cElementTree as ET except ImportError: import xml.etree.ElementTree as ET -CSV_FIELDS = ["name", "biodata", "comments"] - def read_file(path, element_key): # get an iterable record_counter = 0 @@ -23,41 +22,37 @@ def read_file(path, element_key): #the factory inl_factory = factories.INLFactory() - with open('out.csv', 'w', encoding='utf8') as f: - writer = csv.DictWriter(f, CSV_FIELDS) - writer.writeheader() - f667 = open("667.txt", 'w', encoding="utf8") - f678 = open("678.txt", 'w', encoding="utf8") - for event, element in context: - if 'end' in event: - if element_key in element.tag: - #enter the processing here - record_counter += 1 - - #cleaned element is a tree - inl_parser = parsers.INLXmlParser(element) - cleaned_element = inl_parser.clearxml() - entity = inl_factory.get_entity(cleaned_element) - - - #test print the entity - if entity != None: - json_entity = entity.to_json() - print(json_entity) - writer.writerow({'name': entity.name, 'biodata': entity.bio_data, 'comments': json.dumps(entity.comments_list, ensure_ascii=False)}) - # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False) - # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False) - - # entity.print_entity() - - - #TODO analys and upload the entity - - - # import pdb; pdb.set_trace() - #print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', cleaned_element.getroot().text) - element.clear() - f667.close() - f678.close() + files = {} + for event, element in context: + if 'end' in event: + if element_key in element.tag: + #enter the processing here + record_counter += 1 + + #cleaned element is a tree + inl_parser = parsers.INLXmlParser(element) + cleaned_element = inl_parser.clearxml() + entity = inl_factory.get_entity(cleaned_element) + + #test print the entity + if entity != None: + if entity.TYPE not in files: + files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8') + json_entity = entity.to_json() + print(json_entity) + writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS) + writer.writerow(entity.to_csv_dict()) + # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False) + # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False) + + # entity.print_entity() + + + #TODO analys and upload the entity + + + # import pdb; pdb.set_trace() + #print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', cleaned_element.getroot().text) + element.clear() if __name__ == '__main__': read_file(r"C:/Users/Ilsar/Documents/datahack/NLI-nnl10.xml", 'record') -- cgit v1.2.3