From d646c9a42273e98c85602f5618598125007bbfaa Mon Sep 17 00:00:00 2001
From: Tzafrir Cohen
Date: Sun, 25 Sep 2016 20:28:16 +0300
Subject: WIP: commit all files that were changed

---
 .gitignore                 | 274 ++++++++++++++++++++++-----------------------
 .idea/misc.xml             |  10 ++
 entities/__init__.py       |   4 +-
 entities/basic_entity.py   |  10 +-
 entities/institution.py    |  12 +-
 entities/location.py       |  50 ++++-----
 entities/person.py         | 152 ++++++++++++-------------
 factories/INL_factory.py   | 260 +++++++++++++++++++++---------------------
 factories/__init__.py      |   2 +-
 factories/basic_factory.py |   6 +-
 libs/json_tools.py         |  18 +--
 parsers/INL_xml_parser.py  |  72 ++++++------
 parsers/__init__.py        |   2 +-
 parsers/basic_parser.py    |  12 +-
 readers/xml_reader.py      | 122 ++++++++++----------
 testers/factorytester.py   |  42 +++----
 writers/wd_writer.py       |  13 ++-
 17 files changed, 536 insertions(+), 525 deletions(-)

diff --git a/.gitignore b/.gitignore
index 7ebdd82..7c59bc9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,138 +1,138 @@
-# Created by .ignore support plugin (hsz.mobi)
-### JetBrains template
-# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
-# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
-
-# User-specific stuff:
-.idea/workspace.xml
-.idea/tasks.xml
-.idea/dictionaries
-.idea/vcs.xml
-.idea/jsLibraryMappings.xml
-
-# Sensitive or high-churn files:
-.idea/dataSources.ids
-.idea/dataSources.xml
-.idea/dataSources.local.xml
-.idea/sqlDataSources.xml
-.idea/dynamic.xml
-.idea/uiDesigner.xml
-
-# Gradle:
-.idea/gradle.xml
-.idea/libraries
-
-# Mongo Explorer plugin:
-.idea/mongoSettings.xml
-
-## File-based project format:
-*.iws
-
-## Plugin-specific files:
-
-# IntelliJ
-/out/
-
-# mpeltonen/sbt-idea plugin
-.idea_modules/
-
-# JIRA plugin
-atlassian-ide-plugin.xml
-
-# Crashlytics plugin (for Android Studio and IntelliJ)
-com_crashlytics_export_strings.xml
-crashlytics.properties
-crashlytics-build.properties
-fabric.properties
-### Python template
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-env/
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-*.egg-info/
-.installed.cfg
-*.egg
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*,cover
-.hypothesis/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# IPython Notebook
-.ipynb_checkpoints
-
-# pyenv
-.python-version
-
-# celery beat schedule file
-celerybeat-schedule
-
-# dotenv
-.env
-
-# virtualenv
-venv/
-ENV/
-
-# Spyder project settings
-.spyderproject
-
-# Rope project settings
-.ropeproject
-
+# Created by .ignore support plugin (hsz.mobi)
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff:
+.idea/workspace.xml
+.idea/tasks.xml
+.idea/dictionaries
+.idea/vcs.xml
+.idea/jsLibraryMappings.xml
+
+# Sensitive or high-churn files:
+.idea/dataSources.ids
+.idea/dataSources.xml
+.idea/dataSources.local.xml
+.idea/sqlDataSources.xml
+.idea/dynamic.xml
+.idea/uiDesigner.xml
+
+# Gradle:
+.idea/gradle.xml
+.idea/libraries
+
+# Mongo Explorer plugin:
+.idea/mongoSettings.xml
+
+## File-based project format:
+*.iws
+
+## Plugin-specific files:
+
+# IntelliJ
+/out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
+.out/*
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index d3cc99c..84919a4 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,4 +1,14 @@
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/entities/__init__.py b/entities/__init__.py
index 907ef4d..701846e 100644
--- a/entities/__init__.py
+++ b/entities/__init__.py
@@ -1,3 +1,3 @@
-from entities.person import Person
-from entities.institution import Institution
+from entities.person import Person
+from entities.institution import Institution
 from entities.location import Location
\ No newline at end of file
diff --git a/entities/basic_entity.py b/entities/basic_entity.py
index 9181422..9e8f11b 100644
--- a/entities/basic_entity.py
+++ b/entities/basic_entity.py
@@ -1,5 +1,5 @@
-from libs import JsonSerializable
-
-
-class BasicEntity(JsonSerializable):
-    pass
+from libs import JsonSerializable
+
+
+class BasicEntity(JsonSerializable):
+    pass
diff --git a/entities/institution.py b/entities/institution.py
index 4538207..6be86fc 100644
--- a/entities/institution.py
+++ b/entities/institution.py
@@ -1,6 +1,6 @@
-from entities.basic_entity import BasicEntity
-
-
-class Institution(BasicEntity):
-    def __init__(self):
-        raise NotImplementedError()
+from entities.basic_entity import BasicEntity
+
+
+class Institution(BasicEntity):
+    def __init__(self):
+        raise NotImplementedError()
diff --git a/entities/location.py b/entities/location.py
index a43eb8d..f782e1f 100644
--- a/entities/location.py
+++ b/entities/location.py
@@ -1,25 +1,25 @@
-import json
-
-from entities.basic_entity import BasicEntity
-
-
-class Location(BasicEntity):
-    def __init__(self, name, types_of_place, name_in_langs, comments_list):
-        self.name = name
-        self.types_of_place = types_of_place
-        self.name_in_langs = name_in_langs
-        self.comments_list = comments_list
-
-    CSV_FIELDS = ["name", "comments"]
-    TYPE = "LOCATION"
-
-
-    def print_entity(self):
-        print("Name = " + self.name)
-        print("Name in langs = " + str(self.name_in_langs))
-        print("Types = " + str(self.types_of_place))
-        print("Comments = " + str(self.comments_list))
-
-    def to_csv_dict(self):
-        return {'name': self.name,
-                'comments': json.dumps(self.comments_list, ensure_ascii=False)}
+import json
+
+from entities.basic_entity import BasicEntity
+
+
+class Location(BasicEntity):
+    def __init__(self, name, types_of_place, name_in_langs, comments_list):
+        self.name = name
+        self.types_of_place = types_of_place
+        self.name_in_langs = name_in_langs
+        self.comments_list = comments_list
+
+    CSV_FIELDS = ["name", "comments"]
+    TYPE = "LOCATION"
+
+
+    def print_entity(self):
+        print("Name = " + self.name)
+        print("Name in langs = " + str(self.name_in_langs))
+        print("Types = " + str(self.types_of_place))
+        print("Comments = " + str(self.comments_list))
+
+    def to_csv_dict(self):
+        return {'name': self.name,
+                'comments': json.dumps(self.comments_list, ensure_ascii=False)}
diff --git a/entities/person.py b/entities/person.py
index b315aac..a5aa396 100644
--- a/entities/person.py
+++ b/entities/person.py
@@ -1,76 +1,76 @@
-import json
-
-from entities.basic_entity import BasicEntity
-
-
-class Person(BasicEntity):
-    def __init__(self, name, date_of_birth, name_in_langs, bio_data, comments_list, profession):
-        """
-
-        :param name:
-        :param date_of_birth:
-        :param name_in_langs: Mapping of the persons's name in various languages, as a dictionary. For example:
-        {
-            "latin": "George"
-            "heb": "[george in hebrew]"
-        }
-        """
-        self.name = name
-        years_parts = date_of_birth.split('-')
-        if (len(years_parts) == 2):
-            self.birth_year = years_parts[0]
-            self.death_year = years_parts[1]
-        else:
-            self.birth_year = date_of_birth.strip()
-            self.death_year = ''
-        self.name_in_langs = name_in_langs
-        '''
-        place_of_birth = list()
-        place_of_death = list()
-        profession = list()
-        for comment in bio_data:
-            encoded_comment = ''.join(comment).strip()
-            if encoded_comment.startswith(u"מקום לידה: "):
-                place_of_birth.append(encoded_comment.partition(u"מקום לידה: ")[2])
-            if encoded_comment.startswith(u"מקום פטירה: "):
-                place_of_death.append(encoded_comment.partition(u"מקום פטירה: ")[2])
-            if encoded_comment.startswith(u"מקצוע: "):
-                profession.append(encoded_comment.partition(u"מקום פטירה: ")[2])
-
-        self.place_of_birth = place_of_birth
-        self.place_of_death = place_of_death
-        self.profession = profession
-        '''
-        bio_data_dict = dict()
-        for elem in bio_data:
-            elem_splitted = elem.split(":")
-            if len(elem_splitted) == 2:
-                bio_data_key = elem_splitted[0]
-                bio_data_value = elem_splitted[1]
-                if bio_data_key in bio_data_dict:
-                    bio_data_dict.get(bio_data_key).append(bio_data_value)
-                else:
-                    bio_data_dict.update(
-                        {bio_data_key: [bio_data_value]}
-                    )
-            else:
-                bio_data_dict.update({elem: ''})
-        self.bio_data = bio_data_dict
-        self.comments_list = comments_list
-        self.profession = profession
-
-    CSV_FIELDS = ["name", "biodata", "comments"]
-    TYPE = 'PERSON'
-
-    def print_entity(self):
-        print("Name = " + self.name)
-        print("Birth year = " + self.birth_year)
-        print("Death year = " + self.death_year)
-        print("Names in langs = " + str(self.name_in_langs))
-        print("Bio Data = " + str(self.bio_data))
-        print("Comments = " + str(self.comments_list))
-        print("Profession = " + str(self.profession))
-
-    def to_csv_dict(self):
-        return {'name': self.name, 'biodata': self.bio_data,
-                'comments': json.dumps(self.comments_list, ensure_ascii=False)}
+import json
+
+from entities.basic_entity import BasicEntity
+
+
+class Person(BasicEntity):
+    def __init__(self, name, date_of_birth, name_in_langs, bio_data, comments_list, profession):
+        """
+
+        :param name: the person's primary name, as it appears in the record
+        :param date_of_birth: either a 'birth-death' year range or a single year string
+        :param name_in_langs: Mapping of the person's name in various languages, as a dictionary. For example:
+        {
+            "latin": "George",
+            "heb": "[george in hebrew]"
+        }
+        """
+        self.name = name
+        years_parts = date_of_birth.split('-')
+        if len(years_parts) == 2:
+            self.birth_year = years_parts[0]
+            self.death_year = years_parts[1]
+        else:
+            self.birth_year = date_of_birth.strip()
+            self.death_year = ''
+        self.name_in_langs = name_in_langs
+        '''
+        place_of_birth = list()
+        place_of_death = list()
+        profession = list()
+        for comment in bio_data:
+            encoded_comment = ''.join(comment).strip()
+            if encoded_comment.startswith(u"מקום לידה: "):
+                place_of_birth.append(encoded_comment.partition(u"מקום לידה: ")[2])
+            if encoded_comment.startswith(u"מקום פטירה: "):
+                place_of_death.append(encoded_comment.partition(u"מקום פטירה: ")[2])
+            if encoded_comment.startswith(u"מקצוע: "):
+                profession.append(encoded_comment.partition(u"מקצוע: ")[2])
+
+        self.place_of_birth = place_of_birth
+        self.place_of_death = place_of_death
+        self.profession = profession
+        '''
+        bio_data_dict = dict()
+        for elem in bio_data:
+            elem_splitted = elem.split(":")
+            if len(elem_splitted) == 2:
+                bio_data_key = elem_splitted[0]
+                bio_data_value = elem_splitted[1]
+                if bio_data_key in bio_data_dict:
+                    bio_data_dict.get(bio_data_key).append(bio_data_value)
+                else:
+                    bio_data_dict.update(
+                        {bio_data_key: [bio_data_value]}
+                    )
+            else:
+                bio_data_dict.update({elem: ''})
+        self.bio_data = bio_data_dict
+        self.comments_list = comments_list
+        self.profession = profession
+
+    CSV_FIELDS = ["name", "biodata", "comments"]
+    TYPE = 'PERSON'
+
+    def print_entity(self):
+        print("Name = " + self.name)
+        print("Birth year = " + self.birth_year)
+        print("Death year = " + self.death_year)
+        print("Names in langs = " + str(self.name_in_langs))
+        print("Bio Data = " + str(self.bio_data))
+        print("Comments = " + str(self.comments_list))
+        print("Profession = " + str(self.profession))
+
+    def to_csv_dict(self):
+        return {'name': self.name, 'biodata': self.bio_data,
+                'comments': json.dumps(self.comments_list, ensure_ascii=False)}
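
A minimal sketch of what the Person constructor above does with its arguments (all
values invented; note that split(':') keeps the space after the colon in bio_data
values):

    from entities import Person

    p = Person(name="George",
               date_of_birth="1880-1950",
               name_in_langs={"heb": ["..."]},
               bio_data=["profession: writer", "free-form note"],
               comments_list=[],
               profession=["writer"])
    # p.birth_year == "1880", p.death_year == "1950"
    # p.bio_data == {"profession": [" writer"], "free-form note": ""}
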
diff --git a/factories/INL_factory.py b/factories/INL_factory.py
index f4e494f..286762a 100644
--- a/factories/INL_factory.py
+++ b/factories/INL_factory.py
@@ -1,130 +1,130 @@
-import entities
-from factories import BasicFactory
-import xml.etree.cElementTree as ET
-
-TAG_TO_ENTITY_MAPPING = {
-    '100': entities.Person,
-    '110': entities.Institution,
-    '151': entities.Location
-}
-
-
-ENTITY_KEYS = {
-    '100.a': 'name',
-    '100.9': 'name_langindic',
-    '100.d': 'date_of_birth',
-    '400.a': 'name_in_langs',
-    '400.9': 'langs_langindic',
-    '678.a': 'bio_data',
-    '151.a': 'name',
-    '151.9': 'name_langindic',
-    '451:a': 'name_in_langs',
-    '451:9': 'langs_langindic',
-    '550.a': 'type_of_place',
-    '667.a': 'comment',
-    '374.a': 'profession'
-}
-
-
-def get_record_key(record):
-    root = record.getroot()
-    for field in root:
-        field_tag = field.attrib.get('tag')
-        if '100' in field_tag:
-            return '100'
-        if '151' in field_tag:
-            return '151'
-        if '110' in field_tag:
-            return '110'
-
-class INLFactory(BasicFactory):
-    def __init__(self, tag_to_entity_mapping=None):
-        self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING
-
-    def get_entity(self, raw_object, entity_keys=ENTITY_KEYS):
-        record_key = get_record_key(raw_object)
-        #100 is person
-        if record_key == '100':
-            name = ''
-            name_in_langs = dict()
-            bio_data = list()
-            comment_list = list()
-            eng_name = ''
-            date_of_birth = ''
-            profession = list()
-            name_diff = ''
-            #get the names and date of birth and bio data
-            for field in raw_object.getroot():
-                key = field.attrib.get('tag')
-                tag = entity_keys.get(key)
-                if tag == 'name':
-                    name = field.text
-                elif tag == 'name_langindic':
-                    # chack if this english name
-                    if field.text == 'lat':
-                        eng_name = name
-                    # else add it to name_in_langs
-                    else:
-                        if field.text in name_in_langs:
-                            name_in_langs.get(field.text).append(name)
-                        else:
-                            name_in_langs.update({field.text: [name]})
-                elif tag == 'date_of_birth':
-                    date_of_birth = field.text
-                elif tag == 'name_in_langs':
-                    name_diff = field.text
-                elif tag == 'langs_langindic':
-                    if field.text in name_in_langs:
-                        name_in_langs.get(field.text).append(name_diff)
-                    else:
-                        name_in_langs.update({field.text: [name_diff]})
-                elif tag == 'bio_data':
-                    bio_data.append(field.text)
-                elif tag == 'comment':
-                    comment_list.append(field.text)
-                elif tag == 'profession':
-                    profession.append(field.text)
-            return entities.Person(eng_name, date_of_birth, name_in_langs, bio_data, comment_list, profession)
-        #110 is institue
-        elif record_key == '110':
-            return entities.Institution()
-        #151 is location
-        elif record_key == '151':
-            name_in_langs = dict()
-            types_of_place = list()
-            comment_list = list()
-            eng_name = ''
-            name_diff = ''
-
-            for field in raw_object.getroot():
-                key = field.attrib.get('tag')
-                tag = entity_keys.get(key)
-                if tag == 'name':
-                    name = field.text
-                elif tag == 'name_langindic':
-                    # chack if this english name
-                    if field.text == 'lat':
-                        eng_name = name
-                    # else add it to name_in_langs
-                    else:
-                        if field.text in name_in_langs:
-                            name_in_langs.get(field.text).append(name)
-                        else:
-                            name_in_langs.update({field.text: [name]})
-                elif tag == 'type_of_place':
-                    types_of_place.append(field.text)
-                elif tag == 'name_in_langs':
-                    name_diff = field.text
-                elif tag == 'langs_langindic':
-                    if field.text in name_in_langs:
-                        name_in_langs.get(field.text).append(name_diff)
-                    else:
-                        name_in_langs.update({field.text: [name_diff]})
-                elif tag == 'comment':
-                    comment_list.append(field.text)
-            return entities.Location(eng_name, types_of_place , name_in_langs, comment_list)
-        else:
-            return None
-            # raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self)))
-
-
+import entities
+from factories import BasicFactory
+import xml.etree.cElementTree as ET
+
+TAG_TO_ENTITY_MAPPING = {
+    '100': entities.Person,
+    '110': entities.Institution,
+    '151': entities.Location
+}
+
+
+ENTITY_KEYS = {
+    '100.a': 'name',
+    '100.9': 'name_langindic',
+    '100.d': 'date_of_birth',
+    '400.a': 'name_in_langs',
+    '400.9': 'langs_langindic',
+    '678.a': 'bio_data',
+    '151.a': 'name',
+    '151.9': 'name_langindic',
+    '451.a': 'name_in_langs',
+    '451.9': 'langs_langindic',
+    '550.a': 'type_of_place',
+    '667.a': 'comment',
+    '374.a': 'profession'
+}
+
+
+def get_record_key(record):
+    root = record.getroot()
+    for field in root:
+        field_tag = field.attrib.get('tag')
+        if '100' in field_tag:
+            return '100'
+        if '151' in field_tag:
+            return '151'
+        if '110' in field_tag:
+            return '110'
+
+class INLFactory(BasicFactory):
+    def __init__(self, tag_to_entity_mapping=None):
+        self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING
+
+    def get_entity(self, raw_object, entity_keys=ENTITY_KEYS):
+        record_key = get_record_key(raw_object)
+        # 100 is a person
+        if record_key == '100':
+            name = ''
+            name_in_langs = dict()
+            bio_data = list()
+            comment_list = list()
+            eng_name = ''
+            date_of_birth = ''
+            profession = list()
+            name_diff = ''
+            # get the names, date of birth and bio data
+            for field in raw_object.getroot():
+                key = field.attrib.get('tag')
+                tag = entity_keys.get(key)
+                if tag == 'name':
+                    name = field.text
+                elif tag == 'name_langindic':
+                    # check if this is the English name
+                    if field.text == 'lat':
+                        eng_name = name
+                    # else add it to name_in_langs
+                    else:
+                        if field.text in name_in_langs:
+                            name_in_langs.get(field.text).append(name)
+                        else:
+                            name_in_langs.update({field.text: [name]})
+                elif tag == 'date_of_birth':
+                    date_of_birth = field.text
+                elif tag == 'name_in_langs':
+                    name_diff = field.text
+                elif tag == 'langs_langindic':
+                    if field.text in name_in_langs:
+                        name_in_langs.get(field.text).append(name_diff)
+                    else:
+                        name_in_langs.update({field.text: [name_diff]})
+                elif tag == 'bio_data':
+                    bio_data.append(field.text)
+                elif tag == 'comment':
+                    comment_list.append(field.text)
+                elif tag == 'profession':
+                    profession.append(field.text)
+            return entities.Person(eng_name, date_of_birth, name_in_langs, bio_data, comment_list, profession)
+        # 110 is an institution
+        elif record_key == '110':
+            return entities.Institution()
+        # 151 is a location
+        elif record_key == '151':
+            name_in_langs = dict()
+            types_of_place = list()
+            comment_list = list()
+            eng_name = ''
+            name_diff = ''
+
+            for field in raw_object.getroot():
+                key = field.attrib.get('tag')
+                tag = entity_keys.get(key)
+                if tag == 'name':
+                    name = field.text
+                elif tag == 'name_langindic':
+                    # check if this is the English name
+                    if field.text == 'lat':
+                        eng_name = name
+                    # else add it to name_in_langs
+                    else:
+                        if field.text in name_in_langs:
+                            name_in_langs.get(field.text).append(name)
+                        else:
+                            name_in_langs.update({field.text: [name]})
+                elif tag == 'type_of_place':
+                    types_of_place.append(field.text)
+                elif tag == 'name_in_langs':
+                    name_diff = field.text
+                elif tag == 'langs_langindic':
+                    if field.text in name_in_langs:
+                        name_in_langs.get(field.text).append(name_diff)
+                    else:
+                        name_in_langs.update({field.text: [name_diff]})
+                elif tag == 'comment':
+                    comment_list.append(field.text)
+            return entities.Location(eng_name, types_of_place, name_in_langs, comment_list)
+        else:
+            return None
+            # raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self)))
+
+
diff --git a/factories/__init__.py b/factories/__init__.py
index 86901f5..947845c 100644
--- a/factories/__init__.py
+++ b/factories/__init__.py
@@ -1,2 +1,2 @@
-from factories.basic_factory import BasicFactory
+from factories.basic_factory import BasicFactory
 from factories.INL_factory import INLFactory
\ No newline at end of file
diff --git a/factories/basic_factory.py b/factories/basic_factory.py
index 1715846..1974d65 100644
--- a/factories/basic_factory.py
+++ b/factories/basic_factory.py
@@ -1,3 +1,3 @@
-class BasicFactory(object):
-    def get_entity(self, entity_key, raw_object):
-        raise NotImplementedError("get_entity() method must be implemented class {}".format(type(self)))
+class BasicFactory(object):
+    def get_entity(self, entity_key, raw_object):
+        raise NotImplementedError("get_entity() method must be implemented in class {}".format(type(self)))
diff --git a/libs/json_tools.py b/libs/json_tools.py
index 5e78d23..9ce19b0 100644
--- a/libs/json_tools.py
+++ b/libs/json_tools.py
@@ -1,9 +1,9 @@
-import json
-
-
-class JsonSerializable(object):
-    def __repr__(self):
-        return str(self.to_json())
-
-    def to_json(self):
-        return json.dumps(self.__dict__, ensure_ascii=False)
+import json
+
+
+class JsonSerializable(object):
+    def __repr__(self):
+        return str(self.to_json())
+
+    def to_json(self):
+        return json.dumps(self.__dict__, ensure_ascii=False)
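
For reference, a sketch of the INLFactory contract: it expects a record whose
datafields already carry the flattened 'field.subfield' tags produced by
INLXmlParser.clearxml() (the record content below is invented):

    import xml.etree.cElementTree as ET
    from factories import INLFactory

    record = ET.Element('record')
    for tag, text in [('100.a', 'Some Name'), ('100.9', 'lat'), ('100.d', '1880-1950')]:
        ET.SubElement(record, 'datafield', {'tag': tag}).text = text

    entity = INLFactory().get_entity(ET.ElementTree(record))
    # Tag 100 marks a person record, so entity is an entities.Person whose
    # name is 'Some Name', with birth_year '1880' and death_year '1950'.
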
diff --git a/parsers/INL_xml_parser.py b/parsers/INL_xml_parser.py
index 1a06f6b..cdde5a8 100644
--- a/parsers/INL_xml_parser.py
+++ b/parsers/INL_xml_parser.py
@@ -1,36 +1,36 @@
-try:
-    import xml.etree.cElementTree as ET
-except ImportError:
-    import xml.etree.ElementTree as ET
-
-KNOWN_FIELD_TAGS = ['100', '110', '151']
-
-TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374']
-
-class INLXmlParser:
-    def __init__(self, reader, whitelist=TAG_WHITELIST):
-        self.reader = reader
-        #self.whitelist = whitelist or KNOWN_FIELD_TAGS
-        self.whitelist = whitelist
-
-    def clearxml(self):
-
-        # # scan the datafields in the records and copy to the new one only the tags in the whitelist
-        # for record in root: # create new record
-        newRecord = ET.Element('record')
-        for field in self.reader:
-            fieldtag = field.attrib.get('tag')
-            if fieldtag in self.whitelist:
-                temptag = fieldtag
-                # tag 700 and 400 are the same
-                if temptag == '700':
-                    temptag = '400'
-                for data in field:
-                    newFieldTag = temptag
-                    newFieldTag += '.'
-                    newFieldTag += data.attrib.get('code')
-                    newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag})
-                    newTag.text = data.text
-
-        newRecordTree = ET.ElementTree(newRecord)
-        return ET.ElementTree(newRecord)
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+KNOWN_FIELD_TAGS = ['100', '110', '151']
+
+TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374']
+
+class INLXmlParser:
+    def __init__(self, reader, whitelist=TAG_WHITELIST):
+        self.reader = reader
+        #self.whitelist = whitelist or KNOWN_FIELD_TAGS
+        self.whitelist = whitelist
+
+    def clearxml(self):
+
+        # scan the datafields in the record and copy to the new one only the tags in the whitelist
+        # each kept subfield becomes a new datafield tagged '<field>.<subfield code>'
+        newRecord = ET.Element('record')
+        for field in self.reader:
+            fieldtag = field.attrib.get('tag')
+            if fieldtag in self.whitelist:
+                temptag = fieldtag
+                # tag 700 and 400 are the same
+                if temptag == '700':
+                    temptag = '400'
+                for data in field:
+                    newFieldTag = temptag
+                    newFieldTag += '.'
+                    newFieldTag += data.attrib.get('code')
+                    newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag})
+                    newTag.text = data.text
+
+        newRecordTree = ET.ElementTree(newRecord)
+        return newRecordTree
diff --git a/parsers/__init__.py b/parsers/__init__.py
index d32c917..07907f9 100644
--- a/parsers/__init__.py
+++ b/parsers/__init__.py
@@ -1,2 +1,2 @@
-
+
 from .INL_xml_parser import INLXmlParser
\ No newline at end of file
diff --git a/parsers/basic_parser.py b/parsers/basic_parser.py
index dae19cb..32c1b43 100644
--- a/parsers/basic_parser.py
+++ b/parsers/basic_parser.py
@@ -1,6 +1,6 @@
-class BasicParser(object):
-    def __init__(self):
-        pass
-
-    def parse(self, data):
-        raise NotImplementedError("parse() method must be implemented class {}".format(type(self)))
+class BasicParser(object):
+    def __init__(self):
+        pass
+
+    def parse(self, data):
+        raise NotImplementedError("parse() method must be implemented in class {}".format(type(self)))
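
To see what clearxml() produces, a sketch with an invented two-field record
(tag 100 is whitelisted, tag 999 is not):

    import xml.etree.cElementTree as ET
    from parsers import INLXmlParser

    record = ET.fromstring(
        '<record>'
        '<datafield tag="100"><subfield code="a">Some Name</subfield></datafield>'
        '<datafield tag="999"><subfield code="a">dropped</subfield></datafield>'
        '</record>')
    clean = INLXmlParser(record).clearxml()
    # clean.getroot() holds a single flattened field:
    # <datafield tag="100.a">Some Name</datafield>
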
diff --git a/readers/xml_reader.py b/readers/xml_reader.py
index 5b2d1fd..710899d 100644
--- a/readers/xml_reader.py
+++ b/readers/xml_reader.py
@@ -1,61 +1,61 @@
-# from __future__ import absolute_import
-import json
-import csv
-import parsers, factories
-from entities import Person
-
-try:
-    import xml.etree.cElementTree as ET
-except ImportError:
-    import xml.etree.ElementTree as ET
-
-def read_file(path, element_key):
-    # get an iterable
-    record_counter = 0
-    context = ET.iterparse(path, events=("start", "end"))
-
-    # turn it into an iterator
-    context = iter(context)
-
-    # get the root element
-    event, root = context.__next__()
-
-    # the factory
-    inl_factory = factories.INLFactory()
-    files = {}
-    for event, element in context:
-        if 'end' in event:
-            if element_key in element.tag:
-                # enter the processing here
-                record_counter += 1
-
-                #cleaned element is a tree
-                inl_parser = parsers.INLXmlParser(element)
-                cleaned_element = inl_parser.clearxml()
-                entity = inl_factory.get_entity(cleaned_element)
-
-                # test print the entity
-                if entity != None:
-                    if entity.TYPE not in files:
-                        files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8')
-                    json_entity = entity.to_json()
-                    print(json_entity)
-                    writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS)
-                    writer.writerow(entity.to_csv_dict())
-                    # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False)
-                    # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False)
-
-                # entity.print_entity()
-
-                # TODO analys and upload the entity
-
-
-                # import pdb; pdb.set_trace()
-                print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@',
-                      cleaned_element.getroot().text)
-                element.clear()
-    print(record_counter)
-
-
-if __name__ == '__main__':
-    read_file(r"../../NLI-nnl10.xml", 'record')
+# from __future__ import absolute_import
+import json
+import csv
+import parsers, factories
+from entities import Person
+
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+def read_file(path, element_key):
+    # get an iterable
+    record_counter = 0
+    context = ET.iterparse(path, events=("start", "end"))
+
+    # turn it into an iterator
+    context = iter(context)
+
+    # get the root element
+    event, root = context.__next__()
+
+    # the factory
+    inl_factory = factories.INLFactory()
+    files = {}
+    for event, element in context:
+        if 'end' in event:
+            if element_key in element.tag:
+                # enter the processing here
+                record_counter += 1
+
+                # cleaned element is a tree
+                inl_parser = parsers.INLXmlParser(element)
+                cleaned_element = inl_parser.clearxml()
+                entity = inl_factory.get_entity(cleaned_element)
+
+                # test print the entity
+                if entity is not None:
+                    if entity.TYPE not in files:
+                        files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8')
+                    json_entity = entity.to_json()
+                    print(json_entity)
+                    writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS)
+                    writer.writerow(entity.to_csv_dict())
+                    # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False)
+                    # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False)
+
+                # entity.print_entity()
+
+                # TODO: analyze and upload the entity
+
+
+                # import pdb; pdb.set_trace()
+                print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@',
+                      cleaned_element.getroot().text)
+                element.clear()
+    print(record_counter)
+
+
+if __name__ == '__main__':
+    read_file(r"../../NLI-nnl10.xml", 'record')
diff --git a/testers/factorytester.py b/testers/factorytester.py
index 88e660d..55ebe7c 100644
--- a/testers/factorytester.py
+++ b/testers/factorytester.py
@@ -1,21 +1,21 @@
-from __future__ import absolute_import
-import parsers
-import factories
-import xml.etree.cElementTree as ET
-
-xmlpath = r"C:\roy\NLI-nnl10 - 1MB.xml"
-whitelist = ['100', '374', '400', '151', '451', '550', '551', '678']
-
-
-xmltree = ET.parse(xmlpath)
-entities = list()
-inl_factory = factories.INLFactory()
-
-for record in xmltree.getroot():
-    inl_parser = parsers.INLXmlParser(record, whitelist)
-    clean_record = inl_parser.clearxml()
-    entities.append(inl_factory.get_entity(clean_record))
-
-for entity in entities:
-    print(entity)
-
+from __future__ import absolute_import
+import parsers
+import factories
+import xml.etree.cElementTree as ET
+
+xmlpath = r"C:\roy\NLI-nnl10 - 1MB.xml"
+whitelist = ['100', '374', '400', '151', '451', '550', '551', '678']
+
+
+xmltree = ET.parse(xmlpath)
+entities = list()
+inl_factory = factories.INLFactory()
+
+for record in xmltree.getroot():
+    inl_parser = parsers.INLXmlParser(record, whitelist)
+    clean_record = inl_parser.clearxml()
+    entities.append(inl_factory.get_entity(clean_record))
+
+for entity in entities:
+    print(entity)
+
diff --git a/writers/wd_writer.py b/writers/wd_writer.py
index b88833f..4a456e5 100644
--- a/writers/wd_writer.py
+++ b/writers/wd_writer.py
@@ -1,6 +1,7 @@
-import pywikibot
-from pywikibot import pagegenerators, WikidataBot
-
-class WDWriter(object):
-    def __init__(self):
-        pass
\ No newline at end of file
+import pywikibot
+from pywikibot import pagegenerators, WikidataBot
+
+
+class WDWriter(object):
+    def __init__(self, source_path, reader, factory):
+        self.source_path = source_path
-- 
cgit v1.2.3
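
The whole pipeline can be exercised against any small MARC-XML extract (the path
below is a placeholder, not a file shipped with this change):

    from readers.xml_reader import read_file

    # Streams the file with iterparse, cleans each <record> with
    # INLXmlParser, builds entities via INLFactory, and appends one
    # CSV row per entity under ../out/.
    read_file('sample.xml', 'record')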