diff options
author | gilad_ilsar <gandismidas1> | 2016-09-22 16:55:33 +0300 |
---|---|---|
committer | gilad_ilsar <gandismidas1> | 2016-09-22 16:55:33 +0300 |
commit | a59d8c977eac3eb5c71870815730e4c9bf35bad2 (patch) | |
tree | 9bf395039f55e54de512f84f20c20b93a460b977 | |
parent | 3615e3968baa5a464a2725e280a7e0f89e3428cf (diff) |
parser into csv
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | .idea/workspace.xml | 148 | ||||
-rw-r--r-- | entities/location.py | 10 | ||||
-rw-r--r-- | entities/person.py | 8 | ||||
-rw-r--r-- | readers/xml_reader.py | 71 |
5 files changed, 151 insertions, 88 deletions
@@ -134,3 +134,5 @@ ENV/ # Rope project settings .ropeproject + +.out/*
\ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml index f527370..7a9b802 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,9 +2,10 @@ <project version="4"> <component name="ChangeListManager"> <list default="true" id="1d9b5e9b-4282-4345-b663-d1b92a287a32" name="Default" comment=""> + <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.gitignore" afterPath="$PROJECT_DIR$/.gitignore" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" /> - <change type="MODIFICATION" beforePath="$PROJECT_DIR$/factories/INL_factory.py" afterPath="$PROJECT_DIR$/factories/INL_factory.py" /> - <change type="MODIFICATION" beforePath="$PROJECT_DIR$/libs/json_tools.py" afterPath="$PROJECT_DIR$/libs/json_tools.py" /> + <change type="MODIFICATION" beforePath="$PROJECT_DIR$/entities/location.py" afterPath="$PROJECT_DIR$/entities/location.py" /> + <change type="MODIFICATION" beforePath="$PROJECT_DIR$/entities/person.py" afterPath="$PROJECT_DIR$/entities/person.py" /> <change type="MODIFICATION" beforePath="$PROJECT_DIR$/readers/xml_reader.py" afterPath="$PROJECT_DIR$/readers/xml_reader.py" /> </list> <ignored path="lib2wiki.iws" /> @@ -19,7 +20,7 @@ </component> <component name="CoverageDataManager"> <SUITE FILE_PATH="coverage/parser$factorytester.coverage" NAME="factorytester Coverage Results" MODIFIED="1474544553528" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/testers" /> - <SUITE FILE_PATH="coverage/parser$xml_reader.coverage" NAME="xml_reader Coverage Results" MODIFIED="1474551147724" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/readers" /> + <SUITE FILE_PATH="coverage/parser$xml_reader.coverage" NAME="xml_reader Coverage Results" MODIFIED="1474552336489" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/readers" /> </component> <component name="CreatePatchCommitExecutor"> <option name="PATCH_PATH" value="" /> @@ -30,11 +31,11 @@ </component> <component name="FileEditorManager"> <leaf SIDE_TABS_SIZE_LIMIT_KEY="300"> - <file leaf-file-name="xml_reader.py" pinned="false" current-in-tab="true"> + <file leaf-file-name="xml_reader.py" pinned="false" current-in-tab="false"> <entry file="file://$PROJECT_DIR$/readers/xml_reader.py"> <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="136"> - <caret line="26" column="38" selection-start-line="26" selection-start-column="38" selection-end-line="26" selection-end-column="38" /> + <state relative-caret-position="301"> + <caret line="36" column="38" selection-start-line="36" selection-start-column="38" selection-end-line="36" selection-end-column="38" /> <folding> <element signature="e#42#53#0" expanded="true" /> </folding> @@ -55,7 +56,7 @@ <file leaf-file-name="INL_xml_parser.py" pinned="false" current-in-tab="false"> <entry file="file://$PROJECT_DIR$/parsers/INL_xml_parser.py"> <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="160"> + <state relative-caret-position="216"> <caret line="19" column="15" selection-start-line="19" selection-start-column="15" selection-end-line="19" selection-end-column="15" /> <folding /> </state> @@ -65,7 +66,7 @@ <file leaf-file-name="INL_factory.py" pinned="false" current-in-tab="false"> <entry file="file://$PROJECT_DIR$/factories/INL_factory.py"> <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="263"> + <state relative-caret-position="365"> <caret line="113" column="44" selection-start-line="113" selection-start-column="44" selection-end-line="113" selection-end-column="44" /> <folding> <element signature="e#0#15#0" expanded="true" /> @@ -78,9 +79,19 @@ <file leaf-file-name="person.py" pinned="false" current-in-tab="false"> <entry file="file://$PROJECT_DIR$/entities/person.py"> <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="234"> - <caret line="55" column="13" selection-start-line="55" selection-start-column="13" selection-end-line="55" selection-end-column="13" /> - <folding /> + <state relative-caret-position="68"> + <caret line="6" column="55" selection-start-line="6" selection-start-column="55" selection-end-line="6" selection-end-column="55" /> + <folding> + <element signature="e#193#2270#0" expanded="false" /> + <element signature="e#193#478#1" expanded="false" /> + <element signature="e#599#676#0" expanded="false" /> + <element signature="e#703#775#0" expanded="false" /> + <element signature="e#827#1574#1" expanded="false" /> + <element signature="e#1648#2152#0" expanded="false" /> + <element signature="e#1736#2085#0" expanded="false" /> + <element signature="e#1985#2085#0" expanded="false" /> + <element signature="e#2378#2724#0" expanded="false" /> + </folding> </state> </provider> </entry> @@ -95,12 +106,15 @@ </provider> </entry> </file> - <file leaf-file-name="location.py" pinned="false" current-in-tab="false"> + <file leaf-file-name="location.py" pinned="false" current-in-tab="true"> <entry file="file://$PROJECT_DIR$/entities/location.py"> <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="170"> - <caret line="10" column="27" selection-start-line="10" selection-start-column="27" selection-end-line="10" selection-end-column="27" /> - <folding /> + <state relative-caret-position="99"> + <caret line="6" column="64" selection-start-line="6" selection-start-column="64" selection-end-line="6" selection-end-column="64" /> + <folding> + <element signature="e#0#11#0" expanded="true" /> + <element signature="e#174#321#0" expanded="false" /> + </folding> </state> </provider> </entry> @@ -149,19 +163,19 @@ <option value="$PROJECT_DIR$/factories/basic_factory.py" /> <option value="$PROJECT_DIR$/entities/basic_entity.py" /> <option value="$PROJECT_DIR$/entities/institution.py" /> - <option value="$PROJECT_DIR$/.gitignore" /> <option value="$PROJECT_DIR$/entities/__init__.py" /> <option value="$PROJECT_DIR$/factories/__init__.py" /> <option value="$PROJECT_DIR$/libs/__init__.py" /> <option value="$PROJECT_DIR$/entities/testers/factorytester.py" /> <option value="$PROJECT_DIR$/parsers/__init__.py" /> - <option value="$PROJECT_DIR$/entities/location.py" /> <option value="$PROJECT_DIR$/testers/factorytester.py" /> - <option value="$PROJECT_DIR$/entities/person.py" /> <option value="$PROJECT_DIR$/parsers/INL_xml_parser.py" /> <option value="$PROJECT_DIR$/libs/json_tools.py" /> - <option value="$PROJECT_DIR$/readers/xml_reader.py" /> <option value="$PROJECT_DIR$/factories/INL_factory.py" /> + <option value="$PROJECT_DIR$/.gitignore" /> + <option value="$PROJECT_DIR$/entities/person.py" /> + <option value="$PROJECT_DIR$/readers/xml_reader.py" /> + <option value="$PROJECT_DIR$/entities/location.py" /> </list> </option> </component> @@ -602,7 +616,14 @@ <option name="project" value="LOCAL" /> <updated>1474545328764</updated> </task> - <option name="localTasksCounter" value="12" /> + <task id="LOCAL-00012" summary="parser into csv"> + <created>1474551297626</created> + <option name="number" value="00012" /> + <option name="presentableId" value="LOCAL-00012" /> + <option name="project" value="LOCAL" /> + <updated>1474551297626</updated> + </task> + <option name="localTasksCounter" value="13" /> <servers /> </component> <component name="ToolWindowManager"> @@ -616,7 +637,7 @@ <window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" /> <window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.3298969" sideWeight="0.5" order="10" side_tool="false" content_ui="tabs" /> <window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="8" side_tool="false" content_ui="tabs" /> - <window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.32913387" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" /> + <window_info id="Run" active="true" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.32913387" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" /> <window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" /> <window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="9" side_tool="false" content_ui="tabs" /> <window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="true" content_ui="tabs" /> @@ -645,7 +666,8 @@ <MESSAGE value="tester and person entity" /> <MESSAGE value="update the loctaion entity" /> <MESSAGE value="updates" /> - <option name="LAST_COMMIT_MESSAGE" value="updates" /> + <MESSAGE value="parser into csv" /> + <option name="LAST_COMMIT_MESSAGE" value="parser into csv" /> </component> <component name="XDebuggerManager"> <breakpoint-manager> @@ -693,7 +715,17 @@ <provider selected="true" editor-type-id="text-editor"> <state relative-caret-position="680"> <caret line="40" column="20" selection-start-line="40" selection-start-column="8" selection-end-line="40" selection-end-column="20" /> - <folding /> + <folding> + <element signature="e#193#2270#0" expanded="false" /> + <element signature="e#193#478#1" expanded="false" /> + <element signature="e#599#676#0" expanded="false" /> + <element signature="e#703#775#0" expanded="false" /> + <element signature="e#827#1574#1" expanded="false" /> + <element signature="e#1648#2152#0" expanded="false" /> + <element signature="e#1736#2085#0" expanded="false" /> + <element signature="e#1985#2085#0" expanded="false" /> + <element signature="e#2378#2724#0" expanded="false" /> + </folding> </state> </provider> </entry> @@ -711,7 +743,10 @@ <provider selected="true" editor-type-id="text-editor"> <state relative-caret-position="68"> <caret line="4" column="51" selection-start-line="4" selection-start-column="51" selection-end-line="4" selection-end-column="51" /> - <folding /> + <folding> + <element signature="e#0#11#0" expanded="true" /> + <element signature="e#174#321#0" expanded="false" /> + </folding> </state> </provider> </entry> @@ -774,14 +809,6 @@ </state> </provider> </entry> - <entry file="file://$PROJECT_DIR$/.gitignore"> - <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="255"> - <caret line="15" column="5" selection-start-line="15" selection-start-column="5" selection-end-line="15" selection-end-column="5" /> - <folding /> - </state> - </provider> - </entry> <entry file="file://$PROJECT_DIR$/factories/__init__.py"> <provider selected="true" editor-type-id="text-editor"> <state relative-caret-position="17"> @@ -848,29 +875,39 @@ </state> </provider> </entry> - <entry file="file://$PROJECT_DIR$/parsers/INL_xml_parser.py"> + <entry file="file://$PROJECT_DIR$/libs/json_tools.py"> <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="160"> - <caret line="19" column="15" selection-start-line="19" selection-start-column="15" selection-end-line="19" selection-end-column="15" /> - <folding /> + <state relative-caret-position="85"> + <caret line="5" column="34" selection-start-line="5" selection-start-column="34" selection-end-line="5" selection-end-column="34" /> + <folding> + <marker date="1474549999557" expanded="true" signature="69:104" ph="..." /> + <marker date="1474549999557" expanded="true" signature="128:189" ph="..." /> + </folding> </state> </provider> </entry> - <entry file="file://$PROJECT_DIR$/entities/person.py"> + <entry file="file://$PROJECT_DIR$/.gitignore"> <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="234"> - <caret line="55" column="13" selection-start-line="55" selection-start-column="13" selection-end-line="55" selection-end-column="13" /> + <state relative-caret-position="495"> + <caret line="137" column="6" selection-start-line="137" selection-start-column="6" selection-end-line="137" selection-end-column="6" /> <folding /> </state> </provider> </entry> - <entry file="file://$PROJECT_DIR$/libs/json_tools.py"> + <entry file="file://$PROJECT_DIR$/entities/person.py"> <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="85"> - <caret line="5" column="34" selection-start-line="5" selection-start-column="34" selection-end-line="5" selection-end-column="34" /> + <state relative-caret-position="68"> + <caret line="6" column="55" selection-start-line="6" selection-start-column="55" selection-end-line="6" selection-end-column="55" /> <folding> - <marker date="1474549999557" expanded="true" signature="69:104" ph="..." /> - <marker date="1474549999557" expanded="true" signature="128:189" ph="..." /> + <element signature="e#193#2270#0" expanded="false" /> + <element signature="e#193#478#1" expanded="false" /> + <element signature="e#599#676#0" expanded="false" /> + <element signature="e#703#775#0" expanded="false" /> + <element signature="e#827#1574#1" expanded="false" /> + <element signature="e#1648#2152#0" expanded="false" /> + <element signature="e#1736#2085#0" expanded="false" /> + <element signature="e#1985#2085#0" expanded="false" /> + <element signature="e#2378#2724#0" expanded="false" /> </folding> </state> </provider> @@ -883,17 +920,17 @@ </state> </provider> </entry> - <entry file="file://$PROJECT_DIR$/entities/location.py"> + <entry file="file://$PROJECT_DIR$/parsers/INL_xml_parser.py"> <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="170"> - <caret line="10" column="27" selection-start-line="10" selection-start-column="27" selection-end-line="10" selection-end-column="27" /> + <state relative-caret-position="216"> + <caret line="19" column="15" selection-start-line="19" selection-start-column="15" selection-end-line="19" selection-end-column="15" /> <folding /> </state> </provider> </entry> <entry file="file://$PROJECT_DIR$/factories/INL_factory.py"> <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="263"> + <state relative-caret-position="365"> <caret line="113" column="44" selection-start-line="113" selection-start-column="44" selection-end-line="113" selection-end-column="44" /> <folding> <element signature="e#0#15#0" expanded="true" /> @@ -904,13 +941,24 @@ </entry> <entry file="file://$PROJECT_DIR$/readers/xml_reader.py"> <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="136"> - <caret line="26" column="38" selection-start-line="26" selection-start-column="38" selection-end-line="26" selection-end-column="38" /> + <state relative-caret-position="301"> + <caret line="36" column="38" selection-start-line="36" selection-start-column="38" selection-end-line="36" selection-end-column="38" /> <folding> <element signature="e#42#53#0" expanded="true" /> </folding> </state> </provider> </entry> + <entry file="file://$PROJECT_DIR$/entities/location.py"> + <provider selected="true" editor-type-id="text-editor"> + <state relative-caret-position="99"> + <caret line="6" column="64" selection-start-line="6" selection-start-column="64" selection-end-line="6" selection-end-column="64" /> + <folding> + <element signature="e#0#11#0" expanded="true" /> + <element signature="e#174#321#0" expanded="false" /> + </folding> + </state> + </provider> + </entry> </component> </project>
\ No newline at end of file diff --git a/entities/location.py b/entities/location.py index 064b193..a43eb8d 100644 --- a/entities/location.py +++ b/entities/location.py @@ -1,3 +1,5 @@ +import json + from entities.basic_entity import BasicEntity @@ -8,8 +10,16 @@ class Location(BasicEntity): self.name_in_langs = name_in_langs self.comments_list = comments_list + CSV_FIELDS = ["name", "comments"] + TYPE = "LOCATION" + + def print_entity(self): print("Name = " + self.name) print("Name in langs = " + str(self.name_in_langs)) print("Types = " + str(self.types_of_place)) print("Comments = " + str(self.comments_list)) + + def to_csv_dict(self): + return {'name': self.name, + 'comments': json.dumps(self.comments_list, ensure_ascii=False)} diff --git a/entities/person.py b/entities/person.py index c6db584..b315aac 100644 --- a/entities/person.py +++ b/entities/person.py @@ -1,3 +1,5 @@ +import json + from entities.basic_entity import BasicEntity @@ -57,6 +59,8 @@ class Person(BasicEntity): self.comments_list = comments_list self.profession = profession + CSV_FIELDS = ["name", "biodata", "comments"] + TYPE = 'PERSON' def print_entity(self): print("Name = " + self.name) @@ -66,3 +70,7 @@ class Person(BasicEntity): print("Bio Data = " + str(self.bio_data)) print("Comments = " + str(self.comments_list)) print("Profession = " + str(self.profession)) + + def to_csv_dict(self): + return {'name': self.name, 'biodata': self.bio_data, + 'comments': json.dumps(self.comments_list, ensure_ascii=False)} diff --git a/readers/xml_reader.py b/readers/xml_reader.py index 2aaf8c6..af80e25 100644 --- a/readers/xml_reader.py +++ b/readers/xml_reader.py @@ -2,14 +2,13 @@ import json import csv import parsers, factories +from entities import Person try: import xml.etree.cElementTree as ET except ImportError: import xml.etree.ElementTree as ET -CSV_FIELDS = ["name", "biodata", "comments"] - def read_file(path, element_key): # get an iterable record_counter = 0 @@ -23,41 +22,37 @@ def read_file(path, element_key): #the factory inl_factory = factories.INLFactory() - with open('out.csv', 'w', encoding='utf8') as f: - writer = csv.DictWriter(f, CSV_FIELDS) - writer.writeheader() - f667 = open("667.txt", 'w', encoding="utf8") - f678 = open("678.txt", 'w', encoding="utf8") - for event, element in context: - if 'end' in event: - if element_key in element.tag: - #enter the processing here - record_counter += 1 - - #cleaned element is a tree - inl_parser = parsers.INLXmlParser(element) - cleaned_element = inl_parser.clearxml() - entity = inl_factory.get_entity(cleaned_element) - - - #test print the entity - if entity != None: - json_entity = entity.to_json() - print(json_entity) - writer.writerow({'name': entity.name, 'biodata': entity.bio_data, 'comments': json.dumps(entity.comments_list, ensure_ascii=False)}) - # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False) - # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False) - - # entity.print_entity() - - - #TODO analys and upload the entity - - - # import pdb; pdb.set_trace() - #print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', cleaned_element.getroot().text) - element.clear() - f667.close() - f678.close() + files = {} + for event, element in context: + if 'end' in event: + if element_key in element.tag: + #enter the processing here + record_counter += 1 + + #cleaned element is a tree + inl_parser = parsers.INLXmlParser(element) + cleaned_element = inl_parser.clearxml() + entity = inl_factory.get_entity(cleaned_element) + + #test print the entity + if entity != None: + if entity.TYPE not in files: + files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8') + json_entity = entity.to_json() + print(json_entity) + writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS) + writer.writerow(entity.to_csv_dict()) + # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False) + # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False) + + # entity.print_entity() + + + #TODO analys and upload the entity + + + # import pdb; pdb.set_trace() + #print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', cleaned_element.getroot().text) + element.clear() if __name__ == '__main__': read_file(r"C:/Users/Ilsar/Documents/datahack/NLI-nnl10.xml", 'record') |