Diffstat (limited to 'readers/xml_reader.py')
-rw-r--r--  readers/xml_reader.py | 71
1 file changed, 43 insertions(+), 28 deletions(-)
diff --git a/readers/xml_reader.py b/readers/xml_reader.py
index ec2c696..2aaf8c6 100644
--- a/readers/xml_reader.py
+++ b/readers/xml_reader.py
@@ -1,4 +1,6 @@
 # from __future__ import absolute_import
+import json
+import csv
 import parsers, factories

 try:
@@ -6,6 +8,7 @@ try:
 except ImportError:
     import xml.etree.ElementTree as ET

+CSV_FIELDS = ["name", "biodata", "comments"]

 def read_file(path, element_key):
     # get an iterable
@@ -20,33 +23,45 @@ def read_file(path, element_key):
     #the factory
     inl_factory = factories.INLFactory()
-
-    for event, element in context:
-        if 'end' in event:
-            if element_key in element.tag:
-                #enter the processing here
-                record_counter += 1
-
-                for field in element:
-                    print(field.tag, field.attrib)
-
-                #cleaned element is a tree
-                inl_parser = parsers.INLXmlParser(element)
-                cleaned_element = inl_parser.clearxml()
-                entity = inl_factory.get_entity(cleaned_element)
-
-                #test print the entity
-                if entity != None:
-                    entity.print_entity()
-
-
-                #TODO analys and upload the entity
-
-
-                # import pdb; pdb.set_trace()
-                print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', cleaned_element.getroot().text)
-                element.clear()
-
-
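+    # write one CSV row per matching record; newline='' keeps the csv module from emitting blank rows on Windows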
+    with open('out.csv', 'w', encoding='utf8', newline='') as f:
+        writer = csv.DictWriter(f, CSV_FIELDS)
+        writer.writeheader()
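+        # side files for the commented-out JSON debug dumps further down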
+        f667 = open("667.txt", 'w', encoding="utf8")
+        f678 = open("678.txt", 'w', encoding="utf8")
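+        # iterparse yields (event, element) pairs; only fully-built 'end' elements are processed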
+        for event, element in context:
+            if 'end' in event:
+                if element_key in element.tag:
+                    #enter the processing here
+                    record_counter += 1
+
+                    #cleaned element is a tree
+                    inl_parser = parsers.INLXmlParser(element)
+                    cleaned_element = inl_parser.clearxml()
+                    entity = inl_factory.get_entity(cleaned_element)
+
+
+                    # test-print the entity
+                    if entity is not None:
+                        json_entity = entity.to_json()
+                        print(json_entity)
+                        writer.writerow({'name': entity.name, 'biodata': entity.bio_data, 'comments': json.dumps(entity.comments_list, ensure_ascii=False)})
+                        # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False)
+                        # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False)
+
+                        # entity.print_entity()
+
+
+                    #TODO analyze and upload the entity
+
+
+                    # import pdb; pdb.set_trace()
+                    #print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', cleaned_element.getroot().text)
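+                    # clear the processed element so iterparse memory use stays flat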
+                    element.clear()
+        f667.close()
+        f678.close()
 if __name__ == '__main__':
     read_file(r"C:/Users/Ilsar/Documents/datahack/NLI-nnl10.xml", 'record')