diff options
author | Tzafrir Cohen <tzafrir@cohens.org.il> | 2016-09-25 20:28:16 +0300 |
---|---|---|
committer | Tzafrir Cohen <tzafrir@cohens.org.il> | 2016-09-25 20:28:16 +0300 |
commit | d646c9a42273e98c85602f5618598125007bbfaa (patch) | |
tree | d6173d42320f0d4fe702581f34e5c1304eb12089 | |
parent | 41125eb195324d18d9c2c12aa12ecbf66dc5d495 (diff) |
WIP: commit all files that were changed
-rw-r--r-- | .gitignore | 274 | ||||
-rw-r--r-- | .idea/misc.xml | 10 | ||||
-rw-r--r-- | entities/__init__.py | 4 | ||||
-rw-r--r-- | entities/basic_entity.py | 10 | ||||
-rw-r--r-- | entities/institution.py | 12 | ||||
-rw-r--r-- | entities/location.py | 50 | ||||
-rw-r--r-- | entities/person.py | 152 | ||||
-rw-r--r-- | factories/INL_factory.py | 260 | ||||
-rw-r--r-- | factories/__init__.py | 2 | ||||
-rw-r--r-- | factories/basic_factory.py | 6 | ||||
-rw-r--r-- | libs/json_tools.py | 18 | ||||
-rw-r--r-- | parsers/INL_xml_parser.py | 72 | ||||
-rw-r--r-- | parsers/__init__.py | 2 | ||||
-rw-r--r-- | parsers/basic_parser.py | 12 | ||||
-rw-r--r-- | readers/xml_reader.py | 122 | ||||
-rw-r--r-- | testers/factorytester.py | 42 | ||||
-rw-r--r-- | writers/wd_writer.py | 13 |
17 files changed, 536 insertions, 525 deletions
@@ -1,138 +1,138 @@ -# Created by .ignore support plugin (hsz.mobi) -### JetBrains template -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff: -.idea/workspace.xml -.idea/tasks.xml -.idea/dictionaries -.idea/vcs.xml -.idea/jsLibraryMappings.xml - -# Sensitive or high-churn files: -.idea/dataSources.ids -.idea/dataSources.xml -.idea/dataSources.local.xml -.idea/sqlDataSources.xml -.idea/dynamic.xml -.idea/uiDesigner.xml - -# Gradle: -.idea/gradle.xml -.idea/libraries - -# Mongo Explorer plugin: -.idea/mongoSettings.xml - -## File-based project format: -*.iws - -## Plugin-specific files: - -# IntelliJ -/out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties -### Python template -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*,cover -.hypothesis/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# IPython Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# dotenv -.env - -# virtualenv -venv/ -ENV/ - -# Spyder project settings -.spyderproject - -# Rope project settings -.ropeproject - +# Created by .ignore support plugin (hsz.mobi)
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff:
+.idea/workspace.xml
+.idea/tasks.xml
+.idea/dictionaries
+.idea/vcs.xml
+.idea/jsLibraryMappings.xml
+
+# Sensitive or high-churn files:
+.idea/dataSources.ids
+.idea/dataSources.xml
+.idea/dataSources.local.xml
+.idea/sqlDataSources.xml
+.idea/dynamic.xml
+.idea/uiDesigner.xml
+
+# Gradle:
+.idea/gradle.xml
+.idea/libraries
+
+# Mongo Explorer plugin:
+.idea/mongoSettings.xml
+
+## File-based project format:
+*.iws
+
+## Plugin-specific files:
+
+# IntelliJ
+/out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
.out/*
\ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index d3cc99c..84919a4 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,14 @@ <?xml version="1.0" encoding="UTF-8"?> <project version="4"> + <component name="ProjectLevelVcsManager" settingsEditedManually="false"> + <OptionsSetting value="true" id="Add" /> + <OptionsSetting value="true" id="Remove" /> + <OptionsSetting value="true" id="Checkout" /> + <OptionsSetting value="true" id="Update" /> + <OptionsSetting value="true" id="Status" /> + <OptionsSetting value="true" id="Edit" /> + <ConfirmationsSetting value="0" id="Add" /> + <ConfirmationsSetting value="0" id="Remove" /> + </component> <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5.2 (C:\Program Files (x86)\Python35-32\python.exe)" project-jdk-type="Python SDK" /> </project>
\ No newline at end of file diff --git a/entities/__init__.py b/entities/__init__.py index 907ef4d..701846e 100644 --- a/entities/__init__.py +++ b/entities/__init__.py @@ -1,3 +1,3 @@ -from entities.person import Person -from entities.institution import Institution +from entities.person import Person
+from entities.institution import Institution
from entities.location import Location
\ No newline at end of file diff --git a/entities/basic_entity.py b/entities/basic_entity.py index 9181422..9e8f11b 100644 --- a/entities/basic_entity.py +++ b/entities/basic_entity.py @@ -1,5 +1,5 @@ -from libs import JsonSerializable - - -class BasicEntity(JsonSerializable): - pass +from libs import JsonSerializable
+
+
+class BasicEntity(JsonSerializable):
+ pass
diff --git a/entities/institution.py b/entities/institution.py index 4538207..6be86fc 100644 --- a/entities/institution.py +++ b/entities/institution.py @@ -1,6 +1,6 @@ -from entities.basic_entity import BasicEntity - - -class Institution(BasicEntity): - def __init__(self): - raise NotImplementedError() +from entities.basic_entity import BasicEntity
+
+
+class Institution(BasicEntity):
+ def __init__(self):
+ raise NotImplementedError()
diff --git a/entities/location.py b/entities/location.py index a43eb8d..f782e1f 100644 --- a/entities/location.py +++ b/entities/location.py @@ -1,25 +1,25 @@ -import json - -from entities.basic_entity import BasicEntity - - -class Location(BasicEntity): - def __init__(self, name, types_of_place, name_in_langs, comments_list): - self.name = name - self.types_of_place = types_of_place - self.name_in_langs = name_in_langs - self.comments_list = comments_list - - CSV_FIELDS = ["name", "comments"] - TYPE = "LOCATION" - - - def print_entity(self): - print("Name = " + self.name) - print("Name in langs = " + str(self.name_in_langs)) - print("Types = " + str(self.types_of_place)) - print("Comments = " + str(self.comments_list)) - - def to_csv_dict(self): - return {'name': self.name, - 'comments': json.dumps(self.comments_list, ensure_ascii=False)} +import json
+
+from entities.basic_entity import BasicEntity
+
+
+class Location(BasicEntity):
+ def __init__(self, name, types_of_place, name_in_langs, comments_list):
+ self.name = name
+ self.types_of_place = types_of_place
+ self.name_in_langs = name_in_langs
+ self.comments_list = comments_list
+
+ CSV_FIELDS = ["name", "comments"]
+ TYPE = "LOCATION"
+
+
+ def print_entity(self):
+ print("Name = " + self.name)
+ print("Name in langs = " + str(self.name_in_langs))
+ print("Types = " + str(self.types_of_place))
+ print("Comments = " + str(self.comments_list))
+
+ def to_csv_dict(self):
+ return {'name': self.name,
+ 'comments': json.dumps(self.comments_list, ensure_ascii=False)}
diff --git a/entities/person.py b/entities/person.py index b315aac..a5aa396 100644 --- a/entities/person.py +++ b/entities/person.py @@ -1,76 +1,76 @@ -import json - -from entities.basic_entity import BasicEntity - - -class Person(BasicEntity): - def __init__(self, name, date_of_birth, name_in_langs, bio_data, comments_list, profession): - """ - - :param name: - :param date_of_birth: - :param name_in_langs: Mapping of the persons's name in various languages, as a dictionary. For example: - { - "latin": "George" - "heb": "[george in hebrew]" - } - """ - self.name = name - years_parts = date_of_birth.split('-') - if (len(years_parts) == 2): - self.birth_year = years_parts[0] - self.death_year = years_parts[1] - else: - self.birth_year = date_of_birth.strip() - self.death_year = '' - self.name_in_langs = name_in_langs - ''' - place_of_birth = list() - place_of_death = list() - profession = list() - for comment in bio_data: - encoded_comment = ''.join(comment).strip() - if encoded_comment.startswith(u"מקום לידה: "): - place_of_birth.append(encoded_comment.partition(u"מקום לידה: ")[2]) - if encoded_comment.startswith(u"מקום פטירה: "): - place_of_death.append(encoded_comment.partition(u"מקום פטירה: ")[2]) - if encoded_comment.startswith(u"מקצוע: "): - profession.append(encoded_comment.partition(u"מקום פטירה: ")[2]) - - self.place_of_birth = place_of_birth - self.place_of_death = place_of_death - self.profession = profession - ''' - bio_data_dict = dict() - for elem in bio_data: - elem_splitted = elem.split(":") - if len(elem_splitted) == 2: - bio_data_key = elem_splitted[0] - bio_data_value = elem_splitted[1] - if bio_data_key in bio_data_dict: - bio_data_dict.get(bio_data_key).append(bio_data_value) - else: - bio_data_dict.update( - {bio_data_key: [bio_data_value]} - ) - else: - bio_data_dict.update({elem: ''}) - self.bio_data = bio_data_dict - self.comments_list = comments_list - self.profession = profession - - CSV_FIELDS = ["name", "biodata", "comments"] - TYPE = 
'PERSON' - - def print_entity(self): - print("Name = " + self.name) - print("Birth year = " + self.birth_year) - print("Death year = " + self.death_year) - print("Names in langs = " + str(self.name_in_langs)) - print("Bio Data = " + str(self.bio_data)) - print("Comments = " + str(self.comments_list)) - print("Profession = " + str(self.profession)) - - def to_csv_dict(self): - return {'name': self.name, 'biodata': self.bio_data, - 'comments': json.dumps(self.comments_list, ensure_ascii=False)} +import json
+
+from entities.basic_entity import BasicEntity
+
+
+class Person(BasicEntity):
+ def __init__(self, name, date_of_birth, name_in_langs, bio_data, comments_list, profession):
+ """
+
+ :param name:
+ :param date_of_birth:
+ :param name_in_langs: Mapping of the person's name in various languages, as a dictionary. For example:
+ {
+ "latin": "George"
+ "heb": "[george in hebrew]"
+ }
+ """
+ self.name = name
+ years_parts = date_of_birth.split('-')
+ if (len(years_parts) == 2):
+ self.birth_year = years_parts[0]
+ self.death_year = years_parts[1]
+ else:
+ self.birth_year = date_of_birth.strip()
+ self.death_year = ''
+ self.name_in_langs = name_in_langs
+ '''
+ place_of_birth = list()
+ place_of_death = list()
+ profession = list()
+ for comment in bio_data:
+ encoded_comment = ''.join(comment).strip()
+ if encoded_comment.startswith(u"מקום לידה: "):
+ place_of_birth.append(encoded_comment.partition(u"מקום לידה: ")[2])
+ if encoded_comment.startswith(u"מקום פטירה: "):
+ place_of_death.append(encoded_comment.partition(u"מקום פטירה: ")[2])
+ if encoded_comment.startswith(u"מקצוע: "):
+ profession.append(encoded_comment.partition(u"מקום פטירה: ")[2])
+
+ self.place_of_birth = place_of_birth
+ self.place_of_death = place_of_death
+ self.profession = profession
+ '''
+ bio_data_dict = dict()
+ for elem in bio_data:
+ elem_splitted = elem.split(":")
+ if len(elem_splitted) == 2:
+ bio_data_key = elem_splitted[0]
+ bio_data_value = elem_splitted[1]
+ if bio_data_key in bio_data_dict:
+ bio_data_dict.get(bio_data_key).append(bio_data_value)
+ else:
+ bio_data_dict.update(
+ {bio_data_key: [bio_data_value]}
+ )
+ else:
+ bio_data_dict.update({elem: ''})
+ self.bio_data = bio_data_dict
+ self.comments_list = comments_list
+ self.profession = profession
+
+ CSV_FIELDS = ["name", "biodata", "comments"]
+ TYPE = 'PERSON'
+
+ def print_entity(self):
+ print("Name = " + self.name)
+ print("Birth year = " + self.birth_year)
+ print("Death year = " + self.death_year)
+ print("Names in langs = " + str(self.name_in_langs))
+ print("Bio Data = " + str(self.bio_data))
+ print("Comments = " + str(self.comments_list))
+ print("Profession = " + str(self.profession))
+
+ def to_csv_dict(self):
+ return {'name': self.name, 'biodata': self.bio_data,
+ 'comments': json.dumps(self.comments_list, ensure_ascii=False)}
diff --git a/factories/INL_factory.py b/factories/INL_factory.py index f4e494f..286762a 100644 --- a/factories/INL_factory.py +++ b/factories/INL_factory.py @@ -1,130 +1,130 @@ -import entities -from factories import BasicFactory -import xml.etree.cElementTree as ET - -TAG_TO_ENTITY_MAPPING = { - '100': entities.Person, - '110': entities.Institution, - '151': entities.Location -} - - -ENTITY_KEYS = { - '100.a': 'name', - '100.9': 'name_langindic', - '100.d': 'date_of_birth', - '400.a': 'name_in_langs', - '400.9': 'langs_langindic', - '678.a': 'bio_data', - '151.a': 'name', - '151.9': 'name_langindic', - '451:a': 'name_in_langs', - '451:9': 'langs_langindic', - '550.a': 'type_of_place', - '667.a': 'comment', - '374.a': 'profession' -} - - -def get_record_key(record): - root = record.getroot() - for field in root: - field_tag = field.attrib.get('tag') - if '100' in field_tag: - return '100' - if '151' in field_tag: - return '151' - if '110' in field_tag: - return '110' - -class INLFactory(BasicFactory): - def __init__(self, tag_to_entity_mapping=None): - self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING - - def get_entity(self, raw_object, entity_keys=ENTITY_KEYS): - record_key = get_record_key(raw_object) - #100 is person - if record_key == '100': - name = '' - name_in_langs = dict() - bio_data = list() - comment_list = list() - eng_name = '' - date_of_birth = '' - profession = list() - name_diff = '' - #get the names and date of birth and bio data - for field in raw_object.getroot(): - key = field.attrib.get('tag') - tag = entity_keys.get(key) - if tag == 'name': - name = field.text - elif tag == 'name_langindic': - # chack if this english name - if field.text == 'lat': - eng_name = name - # else add it to name_in_langs - else: - if field.text in name_in_langs: - name_in_langs.get(field.text).append(name) - else: - name_in_langs.update({field.text: [name]}) - elif tag == 'date_of_birth': - date_of_birth = field.text - elif tag == 'name_in_langs': - 
name_diff = field.text - elif tag == 'langs_langindic': - if field.text in name_in_langs: - name_in_langs.get(field.text).append(name_diff) - else: - name_in_langs.update({field.text: [name_diff]}) - elif tag == 'bio_data': - bio_data.append(field.text) - elif tag == 'comment': - comment_list.append(field.text) - elif tag == 'profession': - profession.append(field.text) - return entities.Person(eng_name, date_of_birth, name_in_langs, bio_data, comment_list, profession) - #110 is institue - elif record_key == '110': - return entities.Institution() - #151 is location - elif record_key == '151': - name_in_langs = dict() - types_of_place = list() - comment_list = list() - eng_name = '' - name_diff = '' - - for field in raw_object.getroot(): - key = field.attrib.get('tag') - tag = entity_keys.get(key) - if tag == 'name': - name = field.text - elif tag == 'name_langindic': - # chack if this english name - if field.text == 'lat': - eng_name = name - # else add it to name_in_langs - else: - if field.text in name_in_langs: - name_in_langs.get(field.text).append(name) - else: - name_in_langs.update({field.text: [name]}) - elif tag == 'type_of_place': - types_of_place.append(field.text) - elif tag == 'name_in_langs': - name_diff = field.text - elif tag == 'langs_langindic': - if field.text in name_in_langs: - name_in_langs.get(field.text).append(name_diff) - else: - name_in_langs.update({field.text: [name_diff]}) - elif tag == 'comment': - comment_list.append(field.text) - return entities.Location(eng_name, types_of_place , name_in_langs, comment_list) - else: - return None - # raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self))) - - +import entities
+from factories import BasicFactory
+import xml.etree.cElementTree as ET
+
+TAG_TO_ENTITY_MAPPING = {
+ '100': entities.Person,
+ '110': entities.Institution,
+ '151': entities.Location
+}
+
+
+ENTITY_KEYS = {
+ '100.a': 'name',
+ '100.9': 'name_langindic',
+ '100.d': 'date_of_birth',
+ '400.a': 'name_in_langs',
+ '400.9': 'langs_langindic',
+ '678.a': 'bio_data',
+ '151.a': 'name',
+ '151.9': 'name_langindic',
+ '451:a': 'name_in_langs',
+ '451:9': 'langs_langindic',
+ '550.a': 'type_of_place',
+ '667.a': 'comment',
+ '374.a': 'profession'
+}
+
+
+def get_record_key(record):
+ root = record.getroot()
+ for field in root:
+ field_tag = field.attrib.get('tag')
+ if '100' in field_tag:
+ return '100'
+ if '151' in field_tag:
+ return '151'
+ if '110' in field_tag:
+ return '110'
+
+class INLFactory(BasicFactory):
+ def __init__(self, tag_to_entity_mapping=None):
+ self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING
+
+ def get_entity(self, raw_object, entity_keys=ENTITY_KEYS):
+ record_key = get_record_key(raw_object)
+ #100 is person
+ if record_key == '100':
+ name = ''
+ name_in_langs = dict()
+ bio_data = list()
+ comment_list = list()
+ eng_name = ''
+ date_of_birth = ''
+ profession = list()
+ name_diff = ''
+ #get the names and date of birth and bio data
+ for field in raw_object.getroot():
+ key = field.attrib.get('tag')
+ tag = entity_keys.get(key)
+ if tag == 'name':
+ name = field.text
+ elif tag == 'name_langindic':
+ # check if this is the English name
+ if field.text == 'lat':
+ eng_name = name
+ # else add it to name_in_langs
+ else:
+ if field.text in name_in_langs:
+ name_in_langs.get(field.text).append(name)
+ else:
+ name_in_langs.update({field.text: [name]})
+ elif tag == 'date_of_birth':
+ date_of_birth = field.text
+ elif tag == 'name_in_langs':
+ name_diff = field.text
+ elif tag == 'langs_langindic':
+ if field.text in name_in_langs:
+ name_in_langs.get(field.text).append(name_diff)
+ else:
+ name_in_langs.update({field.text: [name_diff]})
+ elif tag == 'bio_data':
+ bio_data.append(field.text)
+ elif tag == 'comment':
+ comment_list.append(field.text)
+ elif tag == 'profession':
+ profession.append(field.text)
+ return entities.Person(eng_name, date_of_birth, name_in_langs, bio_data, comment_list, profession)
+ #110 is institution
+ elif record_key == '110':
+ return entities.Institution()
+ #151 is location
+ elif record_key == '151':
+ name_in_langs = dict()
+ types_of_place = list()
+ comment_list = list()
+ eng_name = ''
+ name_diff = ''
+
+ for field in raw_object.getroot():
+ key = field.attrib.get('tag')
+ tag = entity_keys.get(key)
+ if tag == 'name':
+ name = field.text
+ elif tag == 'name_langindic':
+ # check if this is the English name
+ if field.text == 'lat':
+ eng_name = name
+ # else add it to name_in_langs
+ else:
+ if field.text in name_in_langs:
+ name_in_langs.get(field.text).append(name)
+ else:
+ name_in_langs.update({field.text: [name]})
+ elif tag == 'type_of_place':
+ types_of_place.append(field.text)
+ elif tag == 'name_in_langs':
+ name_diff = field.text
+ elif tag == 'langs_langindic':
+ if field.text in name_in_langs:
+ name_in_langs.get(field.text).append(name_diff)
+ else:
+ name_in_langs.update({field.text: [name_diff]})
+ elif tag == 'comment':
+ comment_list.append(field.text)
+ return entities.Location(eng_name, types_of_place , name_in_langs, comment_list)
+ else:
+ return None
+ # raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self)))
+
+
diff --git a/factories/__init__.py b/factories/__init__.py index 86901f5..947845c 100644 --- a/factories/__init__.py +++ b/factories/__init__.py @@ -1,2 +1,2 @@ -from factories.basic_factory import BasicFactory +from factories.basic_factory import BasicFactory
from factories.INL_factory import INLFactory
\ No newline at end of file diff --git a/factories/basic_factory.py b/factories/basic_factory.py index 1715846..1974d65 100644 --- a/factories/basic_factory.py +++ b/factories/basic_factory.py @@ -1,3 +1,3 @@ -class BasicFactory(object): - def get_entity(self, entity_key, raw_object): - raise NotImplementedError("get_entity() method must be implemented class {}".format(type(self))) +class BasicFactory(object):
+ def get_entity(self, entity_key, raw_object):
+ raise NotImplementedError("get_entity() method must be implemented class {}".format(type(self)))
diff --git a/libs/json_tools.py b/libs/json_tools.py index 5e78d23..9ce19b0 100644 --- a/libs/json_tools.py +++ b/libs/json_tools.py @@ -1,9 +1,9 @@ -import json - - -class JsonSerializable(object): - def __repr__(self): - return str(self.to_json()) - - def to_json(self): - return json.dumps(self.__dict__, ensure_ascii=False) +import json
+
+
+class JsonSerializable(object):
+ def __repr__(self):
+ return str(self.to_json())
+
+ def to_json(self):
+ return json.dumps(self.__dict__, ensure_ascii=False)
diff --git a/parsers/INL_xml_parser.py b/parsers/INL_xml_parser.py index 1a06f6b..cdde5a8 100644 --- a/parsers/INL_xml_parser.py +++ b/parsers/INL_xml_parser.py @@ -1,36 +1,36 @@ -try: - import xml.etree.cElementTree as ET -except ImportError: - import xml.etree.ElementTree as ET - -KNOWN_FIELD_TAGS = ['100', '110', '151'] - -TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374'] - -class INLXmlParser: - def __init__(self, reader, whitelist=TAG_WHITELIST): - self.reader = reader - #self.whitelist = whitelist or KNOWN_FIELD_TAGS - self.whitelist = whitelist - - def clearxml(self): - - # # scan the datafields in the records and copy to the new one only the tags in the whitelist - # for record in root: # create new record - newRecord = ET.Element('record') - for field in self.reader: - fieldtag = field.attrib.get('tag') - if fieldtag in self.whitelist: - temptag = fieldtag - # tag 700 and 400 are the same - if temptag == '700': - temptag = '400' - for data in field: - newFieldTag = temptag - newFieldTag += '.' - newFieldTag += data.attrib.get('code') - newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag}) - newTag.text = data.text - - newRecordTree = ET.ElementTree(newRecord) - return ET.ElementTree(newRecord) +try:
+ import xml.etree.cElementTree as ET
+except ImportError:
+ import xml.etree.ElementTree as ET
+
+KNOWN_FIELD_TAGS = ['100', '110', '151']
+
+TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374']
+
+class INLXmlParser:
+ def __init__(self, reader, whitelist=TAG_WHITELIST):
+ self.reader = reader
+ #self.whitelist = whitelist or KNOWN_FIELD_TAGS
+ self.whitelist = whitelist
+
+ def clearxml(self):
+
+ # # scan the datafields in the records and copy to the new one only the tags in the whitelist
+ # for record in root: # create new record
+ newRecord = ET.Element('record')
+ for field in self.reader:
+ fieldtag = field.attrib.get('tag')
+ if fieldtag in self.whitelist:
+ temptag = fieldtag
+ # tag 700 and 400 are the same
+ if temptag == '700':
+ temptag = '400'
+ for data in field:
+ newFieldTag = temptag
+ newFieldTag += '.'
+ newFieldTag += data.attrib.get('code')
+ newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag})
+ newTag.text = data.text
+
+ newRecordTree = ET.ElementTree(newRecord)
+ return ET.ElementTree(newRecord)
diff --git a/parsers/__init__.py b/parsers/__init__.py index d32c917..07907f9 100644 --- a/parsers/__init__.py +++ b/parsers/__init__.py @@ -1,2 +1,2 @@ - +
from .INL_xml_parser import INLXmlParser
\ No newline at end of file diff --git a/parsers/basic_parser.py b/parsers/basic_parser.py index dae19cb..32c1b43 100644 --- a/parsers/basic_parser.py +++ b/parsers/basic_parser.py @@ -1,6 +1,6 @@ -class BasicParser(object): - def __init__(self): - pass - - def parse(self, data): - raise NotImplementedError("parse() method must be implemented class {}".format(type(self))) +class BasicParser(object):
+ def __init__(self):
+ pass
+
+ def parse(self, data):
+ raise NotImplementedError("parse() method must be implemented class {}".format(type(self)))
diff --git a/readers/xml_reader.py b/readers/xml_reader.py index 5b2d1fd..710899d 100644 --- a/readers/xml_reader.py +++ b/readers/xml_reader.py @@ -1,61 +1,61 @@ -# from __future__ import absolute_import -import json -import csv -import parsers, factories -from entities import Person - -try: - import xml.etree.cElementTree as ET -except ImportError: - import xml.etree.ElementTree as ET - -def read_file(path, element_key): - # get an iterable - record_counter = 0 - context = ET.iterparse(path, events=("start", "end")) - - # turn it into an iterator - context = iter(context) - - # get the root element - event, root = context.__next__() - - # the factory - inl_factory = factories.INLFactory() - files = {} - for event, element in context: - if 'end' in event: - if element_key in element.tag: - # enter the processing here - record_counter += 1 - - #cleaned element is a tree - inl_parser = parsers.INLXmlParser(element) - cleaned_element = inl_parser.clearxml() - entity = inl_factory.get_entity(cleaned_element) - - # test print the entity - if entity != None: - if entity.TYPE not in files: - files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8') - json_entity = entity.to_json() - print(json_entity) - writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS) - writer.writerow(entity.to_csv_dict()) - # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False) - # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False) - - # entity.print_entity() - - # TODO analys and upload the entity - - - # import pdb; pdb.set_trace() - print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', - cleaned_element.getroot().text) - element.clear() - print(record_counter) - - -if __name__ == '__main__': - read_file(r"../../NLI-nnl10.xml", 'record') +# from __future__ import absolute_import
+import json
+import csv
+import parsers, factories
+from entities import Person
+
+try:
+ import xml.etree.cElementTree as ET
+except ImportError:
+ import xml.etree.ElementTree as ET
+
+def read_file(path, element_key):
+ # get an iterable
+ record_counter = 0
+ context = ET.iterparse(path, events=("start", "end"))
+
+ # turn it into an iterator
+ context = iter(context)
+
+ # get the root element
+ event, root = context.__next__()
+
+ # the factory
+ inl_factory = factories.INLFactory()
+ files = {}
+ for event, element in context:
+ if 'end' in event:
+ if element_key in element.tag:
+ # enter the processing here
+ record_counter += 1
+
+ #cleaned element is a tree
+ inl_parser = parsers.INLXmlParser(element)
+ cleaned_element = inl_parser.clearxml()
+ entity = inl_factory.get_entity(cleaned_element)
+
+ # test print the entity
+ if entity != None:
+ if entity.TYPE not in files:
+ files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8')
+ json_entity = entity.to_json()
+ print(json_entity)
+ writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS)
+ writer.writerow(entity.to_csv_dict())
+ # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False)
+ # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False)
+
+ # entity.print_entity()
+
+ # TODO analyze and upload the entity
+
+
+ # import pdb; pdb.set_trace()
+ print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@',
+ cleaned_element.getroot().text)
+ element.clear()
+ print(record_counter)
+
+
+if __name__ == '__main__':
+ read_file(r"../../NLI-nnl10.xml", 'record')
diff --git a/testers/factorytester.py b/testers/factorytester.py index 88e660d..55ebe7c 100644 --- a/testers/factorytester.py +++ b/testers/factorytester.py @@ -1,21 +1,21 @@ -from __future__ import absolute_import -import parsers -import factories -import xml.etree.cElementTree as ET - -xmlpath = r"C:\roy\NLI-nnl10 - 1MB.xml" -whitelist = ['100', '374', '400', '151', '451', '550', '551', '678'] - - -xmltree = ET.parse(xmlpath) -entities = list() -inl_factory = factories.INLFactory() - -for record in xmltree.getroot(): - inl_parser = parsers.INLXmlParser(record, whitelist) - clean_record = inl_parser.clearxml() - entities.append(inl_factory.get_entity(clean_record)) - -for entity in entities: - print(entity) - +from __future__ import absolute_import
+import parsers
+import factories
+import xml.etree.cElementTree as ET
+
+xmlpath = r"C:\roy\NLI-nnl10 - 1MB.xml"
+whitelist = ['100', '374', '400', '151', '451', '550', '551', '678']
+
+
+xmltree = ET.parse(xmlpath)
+entities = list()
+inl_factory = factories.INLFactory()
+
+for record in xmltree.getroot():
+ inl_parser = parsers.INLXmlParser(record, whitelist)
+ clean_record = inl_parser.clearxml()
+ entities.append(inl_factory.get_entity(clean_record))
+
+for entity in entities:
+ print(entity)
+
diff --git a/writers/wd_writer.py b/writers/wd_writer.py index b88833f..4a456e5 100644 --- a/writers/wd_writer.py +++ b/writers/wd_writer.py @@ -1,6 +1,7 @@ -import pywikibot -from pywikibot import pagegenerators, WikidataBot - -class WDWriter(object): - def __init__(self): - pass
\ No newline at end of file +import pywikibot
+from pywikibot import pagegenerators, WikidataBot
+
+
+class WDWriter(object):
+ def __init__(self, source_path, reader, factory):
+ self.source_path = source_path
|