Diffstat (limited to 'readers/xml_reader.py')
-rw-r--r--	readers/xml_reader.py	71
1 file changed, 33 insertions(+), 38 deletions(-)
diff --git a/readers/xml_reader.py b/readers/xml_reader.py
index 2aaf8c6..af80e25 100644
--- a/readers/xml_reader.py
+++ b/readers/xml_reader.py
@@ -2,14 +2,13 @@
 import json
 import csv
 import parsers, factories
+from entities import Person
 try:
     import xml.etree.cElementTree as ET
 except ImportError:
     import xml.etree.ElementTree as ET
-CSV_FIELDS = ["name", "biodata", "comments"]
-
 def read_file(path, element_key):
     # get an iterable
     record_counter = 0
@@ -23,41 +22,37 @@ def read_file(path, element_key):
     #the factory
     inl_factory = factories.INLFactory()
-    with open('out.csv', 'w', encoding='utf8') as f:
-        writer = csv.DictWriter(f, CSV_FIELDS)
-        writer.writeheader()
-        f667 = open("667.txt", 'w', encoding="utf8")
-        f678 = open("678.txt", 'w', encoding="utf8")
-        for event, element in context:
-            if 'end' in event:
-                if element_key in element.tag:
-                    #enter the processing here
-                    record_counter += 1
-
-                    #cleaned element is a tree
-                    inl_parser = parsers.INLXmlParser(element)
-                    cleaned_element = inl_parser.clearxml()
-                    entity = inl_factory.get_entity(cleaned_element)
-
-
-                    #test print the entity
-                    if entity != None:
-                        json_entity = entity.to_json()
-                        print(json_entity)
-                        writer.writerow({'name': entity.name, 'biodata': entity.bio_data, 'comments': json.dumps(entity.comments_list, ensure_ascii=False)})
-                        # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False)
-                        # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False)
-
-                        # entity.print_entity()
-
-
-                    #TODO analys and upload the entity
-
-
-                    # import pdb; pdb.set_trace()
-                    #print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', cleaned_element.getroot().text)
-                    element.clear()
-        f667.close()
-        f678.close()
+    files = {}
+    for event, element in context:
+        if 'end' in event:
+            if element_key in element.tag:
+                # enter the processing here
+                record_counter += 1
+
+                # the cleaned element is a tree
+                inl_parser = parsers.INLXmlParser(element)
+                cleaned_element = inl_parser.clearxml()
+                entity = inl_factory.get_entity(cleaned_element)
+
+                # test-print the entity
+                if entity is not None:
+                    if entity.TYPE not in files:
+                        files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8')
+                        # write the header row once per newly opened file
+                        csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS).writeheader()
+                    json_entity = entity.to_json()
+                    print(json_entity)
+                    writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS)
+                    writer.writerow(entity.to_csv_dict())
+
+                # TODO: analyze and upload the entity
+                element.clear()
+    # close the per-type output files once the document has been consumed
+    for f in files.values():
+        f.close()
 if __name__ == '__main__':
     read_file(r"C:/Users/Ilsar/Documents/datahack/NLI-nnl10.xml", 'record')