summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTzafrir Cohen <tzafrir@cohens.org.il>2016-09-25 20:28:16 +0300
committerTzafrir Cohen <tzafrir@cohens.org.il>2016-09-25 20:28:16 +0300
commitd646c9a42273e98c85602f5618598125007bbfaa (patch)
treed6173d42320f0d4fe702581f34e5c1304eb12089
parent41125eb195324d18d9c2c12aa12ecbf66dc5d495 (diff)
WIP: commit all files that were changed
-rw-r--r--.gitignore274
-rw-r--r--.idea/misc.xml10
-rw-r--r--entities/__init__.py4
-rw-r--r--entities/basic_entity.py10
-rw-r--r--entities/institution.py12
-rw-r--r--entities/location.py50
-rw-r--r--entities/person.py152
-rw-r--r--factories/INL_factory.py260
-rw-r--r--factories/__init__.py2
-rw-r--r--factories/basic_factory.py6
-rw-r--r--libs/json_tools.py18
-rw-r--r--parsers/INL_xml_parser.py72
-rw-r--r--parsers/__init__.py2
-rw-r--r--parsers/basic_parser.py12
-rw-r--r--readers/xml_reader.py122
-rw-r--r--testers/factorytester.py42
-rw-r--r--writers/wd_writer.py13
17 files changed, 536 insertions, 525 deletions
diff --git a/.gitignore b/.gitignore
index 7ebdd82..7c59bc9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,138 +1,138 @@
-# Created by .ignore support plugin (hsz.mobi)
-### JetBrains template
-# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
-# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
-
-# User-specific stuff:
-.idea/workspace.xml
-.idea/tasks.xml
-.idea/dictionaries
-.idea/vcs.xml
-.idea/jsLibraryMappings.xml
-
-# Sensitive or high-churn files:
-.idea/dataSources.ids
-.idea/dataSources.xml
-.idea/dataSources.local.xml
-.idea/sqlDataSources.xml
-.idea/dynamic.xml
-.idea/uiDesigner.xml
-
-# Gradle:
-.idea/gradle.xml
-.idea/libraries
-
-# Mongo Explorer plugin:
-.idea/mongoSettings.xml
-
-## File-based project format:
-*.iws
-
-## Plugin-specific files:
-
-# IntelliJ
-/out/
-
-# mpeltonen/sbt-idea plugin
-.idea_modules/
-
-# JIRA plugin
-atlassian-ide-plugin.xml
-
-# Crashlytics plugin (for Android Studio and IntelliJ)
-com_crashlytics_export_strings.xml
-crashlytics.properties
-crashlytics-build.properties
-fabric.properties
-### Python template
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-env/
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-*.egg-info/
-.installed.cfg
-*.egg
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*,cover
-.hypothesis/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# IPython Notebook
-.ipynb_checkpoints
-
-# pyenv
-.python-version
-
-# celery beat schedule file
-celerybeat-schedule
-
-# dotenv
-.env
-
-# virtualenv
-venv/
-ENV/
-
-# Spyder project settings
-.spyderproject
-
-# Rope project settings
-.ropeproject
-
+# Created by .ignore support plugin (hsz.mobi)
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff:
+.idea/workspace.xml
+.idea/tasks.xml
+.idea/dictionaries
+.idea/vcs.xml
+.idea/jsLibraryMappings.xml
+
+# Sensitive or high-churn files:
+.idea/dataSources.ids
+.idea/dataSources.xml
+.idea/dataSources.local.xml
+.idea/sqlDataSources.xml
+.idea/dynamic.xml
+.idea/uiDesigner.xml
+
+# Gradle:
+.idea/gradle.xml
+.idea/libraries
+
+# Mongo Explorer plugin:
+.idea/mongoSettings.xml
+
+## File-based project format:
+*.iws
+
+## Plugin-specific files:
+
+# IntelliJ
+/out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
.out/* \ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index d3cc99c..84919a4 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,4 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
+ <component name="ProjectLevelVcsManager" settingsEditedManually="false">
+ <OptionsSetting value="true" id="Add" />
+ <OptionsSetting value="true" id="Remove" />
+ <OptionsSetting value="true" id="Checkout" />
+ <OptionsSetting value="true" id="Update" />
+ <OptionsSetting value="true" id="Status" />
+ <OptionsSetting value="true" id="Edit" />
+ <ConfirmationsSetting value="0" id="Add" />
+ <ConfirmationsSetting value="0" id="Remove" />
+ </component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5.2 (C:\Program Files (x86)\Python35-32\python.exe)" project-jdk-type="Python SDK" />
</project> \ No newline at end of file
diff --git a/entities/__init__.py b/entities/__init__.py
index 907ef4d..701846e 100644
--- a/entities/__init__.py
+++ b/entities/__init__.py
@@ -1,3 +1,3 @@
-from entities.person import Person
-from entities.institution import Institution
+from entities.person import Person
+from entities.institution import Institution
from entities.location import Location \ No newline at end of file
diff --git a/entities/basic_entity.py b/entities/basic_entity.py
index 9181422..9e8f11b 100644
--- a/entities/basic_entity.py
+++ b/entities/basic_entity.py
@@ -1,5 +1,5 @@
-from libs import JsonSerializable
-
-
-class BasicEntity(JsonSerializable):
- pass
+from libs import JsonSerializable
+
+
+class BasicEntity(JsonSerializable):
+ pass
diff --git a/entities/institution.py b/entities/institution.py
index 4538207..6be86fc 100644
--- a/entities/institution.py
+++ b/entities/institution.py
@@ -1,6 +1,6 @@
-from entities.basic_entity import BasicEntity
-
-
-class Institution(BasicEntity):
- def __init__(self):
- raise NotImplementedError()
+from entities.basic_entity import BasicEntity
+
+
+class Institution(BasicEntity):
+ def __init__(self):
+ raise NotImplementedError()
diff --git a/entities/location.py b/entities/location.py
index a43eb8d..f782e1f 100644
--- a/entities/location.py
+++ b/entities/location.py
@@ -1,25 +1,25 @@
-import json
-
-from entities.basic_entity import BasicEntity
-
-
-class Location(BasicEntity):
- def __init__(self, name, types_of_place, name_in_langs, comments_list):
- self.name = name
- self.types_of_place = types_of_place
- self.name_in_langs = name_in_langs
- self.comments_list = comments_list
-
- CSV_FIELDS = ["name", "comments"]
- TYPE = "LOCATION"
-
-
- def print_entity(self):
- print("Name = " + self.name)
- print("Name in langs = " + str(self.name_in_langs))
- print("Types = " + str(self.types_of_place))
- print("Comments = " + str(self.comments_list))
-
- def to_csv_dict(self):
- return {'name': self.name,
- 'comments': json.dumps(self.comments_list, ensure_ascii=False)}
+import json
+
+from entities.basic_entity import BasicEntity
+
+
+class Location(BasicEntity):
+ def __init__(self, name, types_of_place, name_in_langs, comments_list):
+ self.name = name
+ self.types_of_place = types_of_place
+ self.name_in_langs = name_in_langs
+ self.comments_list = comments_list
+
+ CSV_FIELDS = ["name", "comments"]
+ TYPE = "LOCATION"
+
+
+ def print_entity(self):
+ print("Name = " + self.name)
+ print("Name in langs = " + str(self.name_in_langs))
+ print("Types = " + str(self.types_of_place))
+ print("Comments = " + str(self.comments_list))
+
+ def to_csv_dict(self):
+ return {'name': self.name,
+ 'comments': json.dumps(self.comments_list, ensure_ascii=False)}
diff --git a/entities/person.py b/entities/person.py
index b315aac..a5aa396 100644
--- a/entities/person.py
+++ b/entities/person.py
@@ -1,76 +1,76 @@
-import json
-
-from entities.basic_entity import BasicEntity
-
-
-class Person(BasicEntity):
- def __init__(self, name, date_of_birth, name_in_langs, bio_data, comments_list, profession):
- """
-
- :param name:
- :param date_of_birth:
- :param name_in_langs: Mapping of the persons's name in various languages, as a dictionary. For example:
- {
- "latin": "George"
- "heb": "[george in hebrew]"
- }
- """
- self.name = name
- years_parts = date_of_birth.split('-')
- if (len(years_parts) == 2):
- self.birth_year = years_parts[0]
- self.death_year = years_parts[1]
- else:
- self.birth_year = date_of_birth.strip()
- self.death_year = ''
- self.name_in_langs = name_in_langs
- '''
- place_of_birth = list()
- place_of_death = list()
- profession = list()
- for comment in bio_data:
- encoded_comment = ''.join(comment).strip()
- if encoded_comment.startswith(u"מקום לידה: "):
- place_of_birth.append(encoded_comment.partition(u"מקום לידה: ")[2])
- if encoded_comment.startswith(u"מקום פטירה: "):
- place_of_death.append(encoded_comment.partition(u"מקום פטירה: ")[2])
- if encoded_comment.startswith(u"מקצוע: "):
- profession.append(encoded_comment.partition(u"מקום פטירה: ")[2])
-
- self.place_of_birth = place_of_birth
- self.place_of_death = place_of_death
- self.profession = profession
- '''
- bio_data_dict = dict()
- for elem in bio_data:
- elem_splitted = elem.split(":")
- if len(elem_splitted) == 2:
- bio_data_key = elem_splitted[0]
- bio_data_value = elem_splitted[1]
- if bio_data_key in bio_data_dict:
- bio_data_dict.get(bio_data_key).append(bio_data_value)
- else:
- bio_data_dict.update(
- {bio_data_key: [bio_data_value]}
- )
- else:
- bio_data_dict.update({elem: ''})
- self.bio_data = bio_data_dict
- self.comments_list = comments_list
- self.profession = profession
-
- CSV_FIELDS = ["name", "biodata", "comments"]
- TYPE = 'PERSON'
-
- def print_entity(self):
- print("Name = " + self.name)
- print("Birth year = " + self.birth_year)
- print("Death year = " + self.death_year)
- print("Names in langs = " + str(self.name_in_langs))
- print("Bio Data = " + str(self.bio_data))
- print("Comments = " + str(self.comments_list))
- print("Profession = " + str(self.profession))
-
- def to_csv_dict(self):
- return {'name': self.name, 'biodata': self.bio_data,
- 'comments': json.dumps(self.comments_list, ensure_ascii=False)}
+import json
+
+from entities.basic_entity import BasicEntity
+
+
+class Person(BasicEntity):
+ def __init__(self, name, date_of_birth, name_in_langs, bio_data, comments_list, profession):
+ """
+
+ :param name:
+ :param date_of_birth:
+ :param name_in_langs: Mapping of the persons's name in various languages, as a dictionary. For example:
+ {
+ "latin": "George"
+ "heb": "[george in hebrew]"
+ }
+ """
+ self.name = name
+ years_parts = date_of_birth.split('-')
+ if (len(years_parts) == 2):
+ self.birth_year = years_parts[0]
+ self.death_year = years_parts[1]
+ else:
+ self.birth_year = date_of_birth.strip()
+ self.death_year = ''
+ self.name_in_langs = name_in_langs
+ '''
+ place_of_birth = list()
+ place_of_death = list()
+ profession = list()
+ for comment in bio_data:
+ encoded_comment = ''.join(comment).strip()
+ if encoded_comment.startswith(u"מקום לידה: "):
+ place_of_birth.append(encoded_comment.partition(u"מקום לידה: ")[2])
+ if encoded_comment.startswith(u"מקום פטירה: "):
+ place_of_death.append(encoded_comment.partition(u"מקום פטירה: ")[2])
+ if encoded_comment.startswith(u"מקצוע: "):
+ profession.append(encoded_comment.partition(u"מקום פטירה: ")[2])
+
+ self.place_of_birth = place_of_birth
+ self.place_of_death = place_of_death
+ self.profession = profession
+ '''
+ bio_data_dict = dict()
+ for elem in bio_data:
+ elem_splitted = elem.split(":")
+ if len(elem_splitted) == 2:
+ bio_data_key = elem_splitted[0]
+ bio_data_value = elem_splitted[1]
+ if bio_data_key in bio_data_dict:
+ bio_data_dict.get(bio_data_key).append(bio_data_value)
+ else:
+ bio_data_dict.update(
+ {bio_data_key: [bio_data_value]}
+ )
+ else:
+ bio_data_dict.update({elem: ''})
+ self.bio_data = bio_data_dict
+ self.comments_list = comments_list
+ self.profession = profession
+
+ CSV_FIELDS = ["name", "biodata", "comments"]
+ TYPE = 'PERSON'
+
+ def print_entity(self):
+ print("Name = " + self.name)
+ print("Birth year = " + self.birth_year)
+ print("Death year = " + self.death_year)
+ print("Names in langs = " + str(self.name_in_langs))
+ print("Bio Data = " + str(self.bio_data))
+ print("Comments = " + str(self.comments_list))
+ print("Profession = " + str(self.profession))
+
+ def to_csv_dict(self):
+ return {'name': self.name, 'biodata': self.bio_data,
+ 'comments': json.dumps(self.comments_list, ensure_ascii=False)}
diff --git a/factories/INL_factory.py b/factories/INL_factory.py
index f4e494f..286762a 100644
--- a/factories/INL_factory.py
+++ b/factories/INL_factory.py
@@ -1,130 +1,130 @@
-import entities
-from factories import BasicFactory
-import xml.etree.cElementTree as ET
-
-TAG_TO_ENTITY_MAPPING = {
- '100': entities.Person,
- '110': entities.Institution,
- '151': entities.Location
-}
-
-
-ENTITY_KEYS = {
- '100.a': 'name',
- '100.9': 'name_langindic',
- '100.d': 'date_of_birth',
- '400.a': 'name_in_langs',
- '400.9': 'langs_langindic',
- '678.a': 'bio_data',
- '151.a': 'name',
- '151.9': 'name_langindic',
- '451:a': 'name_in_langs',
- '451:9': 'langs_langindic',
- '550.a': 'type_of_place',
- '667.a': 'comment',
- '374.a': 'profession'
-}
-
-
-def get_record_key(record):
- root = record.getroot()
- for field in root:
- field_tag = field.attrib.get('tag')
- if '100' in field_tag:
- return '100'
- if '151' in field_tag:
- return '151'
- if '110' in field_tag:
- return '110'
-
-class INLFactory(BasicFactory):
- def __init__(self, tag_to_entity_mapping=None):
- self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING
-
- def get_entity(self, raw_object, entity_keys=ENTITY_KEYS):
- record_key = get_record_key(raw_object)
- #100 is person
- if record_key == '100':
- name = ''
- name_in_langs = dict()
- bio_data = list()
- comment_list = list()
- eng_name = ''
- date_of_birth = ''
- profession = list()
- name_diff = ''
- #get the names and date of birth and bio data
- for field in raw_object.getroot():
- key = field.attrib.get('tag')
- tag = entity_keys.get(key)
- if tag == 'name':
- name = field.text
- elif tag == 'name_langindic':
- # chack if this english name
- if field.text == 'lat':
- eng_name = name
- # else add it to name_in_langs
- else:
- if field.text in name_in_langs:
- name_in_langs.get(field.text).append(name)
- else:
- name_in_langs.update({field.text: [name]})
- elif tag == 'date_of_birth':
- date_of_birth = field.text
- elif tag == 'name_in_langs':
- name_diff = field.text
- elif tag == 'langs_langindic':
- if field.text in name_in_langs:
- name_in_langs.get(field.text).append(name_diff)
- else:
- name_in_langs.update({field.text: [name_diff]})
- elif tag == 'bio_data':
- bio_data.append(field.text)
- elif tag == 'comment':
- comment_list.append(field.text)
- elif tag == 'profession':
- profession.append(field.text)
- return entities.Person(eng_name, date_of_birth, name_in_langs, bio_data, comment_list, profession)
- #110 is institue
- elif record_key == '110':
- return entities.Institution()
- #151 is location
- elif record_key == '151':
- name_in_langs = dict()
- types_of_place = list()
- comment_list = list()
- eng_name = ''
- name_diff = ''
-
- for field in raw_object.getroot():
- key = field.attrib.get('tag')
- tag = entity_keys.get(key)
- if tag == 'name':
- name = field.text
- elif tag == 'name_langindic':
- # chack if this english name
- if field.text == 'lat':
- eng_name = name
- # else add it to name_in_langs
- else:
- if field.text in name_in_langs:
- name_in_langs.get(field.text).append(name)
- else:
- name_in_langs.update({field.text: [name]})
- elif tag == 'type_of_place':
- types_of_place.append(field.text)
- elif tag == 'name_in_langs':
- name_diff = field.text
- elif tag == 'langs_langindic':
- if field.text in name_in_langs:
- name_in_langs.get(field.text).append(name_diff)
- else:
- name_in_langs.update({field.text: [name_diff]})
- elif tag == 'comment':
- comment_list.append(field.text)
- return entities.Location(eng_name, types_of_place , name_in_langs, comment_list)
- else:
- return None
- # raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self)))
-
-
+import entities
+from factories import BasicFactory
+import xml.etree.cElementTree as ET
+
+TAG_TO_ENTITY_MAPPING = {
+ '100': entities.Person,
+ '110': entities.Institution,
+ '151': entities.Location
+}
+
+
+ENTITY_KEYS = {
+ '100.a': 'name',
+ '100.9': 'name_langindic',
+ '100.d': 'date_of_birth',
+ '400.a': 'name_in_langs',
+ '400.9': 'langs_langindic',
+ '678.a': 'bio_data',
+ '151.a': 'name',
+ '151.9': 'name_langindic',
+ '451:a': 'name_in_langs',
+ '451:9': 'langs_langindic',
+ '550.a': 'type_of_place',
+ '667.a': 'comment',
+ '374.a': 'profession'
+}
+
+
+def get_record_key(record):
+ root = record.getroot()
+ for field in root:
+ field_tag = field.attrib.get('tag')
+ if '100' in field_tag:
+ return '100'
+ if '151' in field_tag:
+ return '151'
+ if '110' in field_tag:
+ return '110'
+
+class INLFactory(BasicFactory):
+ def __init__(self, tag_to_entity_mapping=None):
+ self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING
+
+ def get_entity(self, raw_object, entity_keys=ENTITY_KEYS):
+ record_key = get_record_key(raw_object)
+ #100 is person
+ if record_key == '100':
+ name = ''
+ name_in_langs = dict()
+ bio_data = list()
+ comment_list = list()
+ eng_name = ''
+ date_of_birth = ''
+ profession = list()
+ name_diff = ''
+ #get the names and date of birth and bio data
+ for field in raw_object.getroot():
+ key = field.attrib.get('tag')
+ tag = entity_keys.get(key)
+ if tag == 'name':
+ name = field.text
+ elif tag == 'name_langindic':
+ # chack if this english name
+ if field.text == 'lat':
+ eng_name = name
+ # else add it to name_in_langs
+ else:
+ if field.text in name_in_langs:
+ name_in_langs.get(field.text).append(name)
+ else:
+ name_in_langs.update({field.text: [name]})
+ elif tag == 'date_of_birth':
+ date_of_birth = field.text
+ elif tag == 'name_in_langs':
+ name_diff = field.text
+ elif tag == 'langs_langindic':
+ if field.text in name_in_langs:
+ name_in_langs.get(field.text).append(name_diff)
+ else:
+ name_in_langs.update({field.text: [name_diff]})
+ elif tag == 'bio_data':
+ bio_data.append(field.text)
+ elif tag == 'comment':
+ comment_list.append(field.text)
+ elif tag == 'profession':
+ profession.append(field.text)
+ return entities.Person(eng_name, date_of_birth, name_in_langs, bio_data, comment_list, profession)
+ #110 is institue
+ elif record_key == '110':
+ return entities.Institution()
+ #151 is location
+ elif record_key == '151':
+ name_in_langs = dict()
+ types_of_place = list()
+ comment_list = list()
+ eng_name = ''
+ name_diff = ''
+
+ for field in raw_object.getroot():
+ key = field.attrib.get('tag')
+ tag = entity_keys.get(key)
+ if tag == 'name':
+ name = field.text
+ elif tag == 'name_langindic':
+ # chack if this english name
+ if field.text == 'lat':
+ eng_name = name
+ # else add it to name_in_langs
+ else:
+ if field.text in name_in_langs:
+ name_in_langs.get(field.text).append(name)
+ else:
+ name_in_langs.update({field.text: [name]})
+ elif tag == 'type_of_place':
+ types_of_place.append(field.text)
+ elif tag == 'name_in_langs':
+ name_diff = field.text
+ elif tag == 'langs_langindic':
+ if field.text in name_in_langs:
+ name_in_langs.get(field.text).append(name_diff)
+ else:
+ name_in_langs.update({field.text: [name_diff]})
+ elif tag == 'comment':
+ comment_list.append(field.text)
+ return entities.Location(eng_name, types_of_place , name_in_langs, comment_list)
+ else:
+ return None
+ # raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self)))
+
+
diff --git a/factories/__init__.py b/factories/__init__.py
index 86901f5..947845c 100644
--- a/factories/__init__.py
+++ b/factories/__init__.py
@@ -1,2 +1,2 @@
-from factories.basic_factory import BasicFactory
+from factories.basic_factory import BasicFactory
from factories.INL_factory import INLFactory \ No newline at end of file
diff --git a/factories/basic_factory.py b/factories/basic_factory.py
index 1715846..1974d65 100644
--- a/factories/basic_factory.py
+++ b/factories/basic_factory.py
@@ -1,3 +1,3 @@
-class BasicFactory(object):
- def get_entity(self, entity_key, raw_object):
- raise NotImplementedError("get_entity() method must be implemented class {}".format(type(self)))
+class BasicFactory(object):
+ def get_entity(self, entity_key, raw_object):
+ raise NotImplementedError("get_entity() method must be implemented class {}".format(type(self)))
diff --git a/libs/json_tools.py b/libs/json_tools.py
index 5e78d23..9ce19b0 100644
--- a/libs/json_tools.py
+++ b/libs/json_tools.py
@@ -1,9 +1,9 @@
-import json
-
-
-class JsonSerializable(object):
- def __repr__(self):
- return str(self.to_json())
-
- def to_json(self):
- return json.dumps(self.__dict__, ensure_ascii=False)
+import json
+
+
+class JsonSerializable(object):
+ def __repr__(self):
+ return str(self.to_json())
+
+ def to_json(self):
+ return json.dumps(self.__dict__, ensure_ascii=False)
diff --git a/parsers/INL_xml_parser.py b/parsers/INL_xml_parser.py
index 1a06f6b..cdde5a8 100644
--- a/parsers/INL_xml_parser.py
+++ b/parsers/INL_xml_parser.py
@@ -1,36 +1,36 @@
-try:
- import xml.etree.cElementTree as ET
-except ImportError:
- import xml.etree.ElementTree as ET
-
-KNOWN_FIELD_TAGS = ['100', '110', '151']
-
-TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374']
-
-class INLXmlParser:
- def __init__(self, reader, whitelist=TAG_WHITELIST):
- self.reader = reader
- #self.whitelist = whitelist or KNOWN_FIELD_TAGS
- self.whitelist = whitelist
-
- def clearxml(self):
-
- # # scan the datafields in the records and copy to the new one only the tags in the whitelist
- # for record in root: # create new record
- newRecord = ET.Element('record')
- for field in self.reader:
- fieldtag = field.attrib.get('tag')
- if fieldtag in self.whitelist:
- temptag = fieldtag
- # tag 700 and 400 are the same
- if temptag == '700':
- temptag = '400'
- for data in field:
- newFieldTag = temptag
- newFieldTag += '.'
- newFieldTag += data.attrib.get('code')
- newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag})
- newTag.text = data.text
-
- newRecordTree = ET.ElementTree(newRecord)
- return ET.ElementTree(newRecord)
+try:
+ import xml.etree.cElementTree as ET
+except ImportError:
+ import xml.etree.ElementTree as ET
+
+KNOWN_FIELD_TAGS = ['100', '110', '151']
+
+TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374']
+
+class INLXmlParser:
+ def __init__(self, reader, whitelist=TAG_WHITELIST):
+ self.reader = reader
+ #self.whitelist = whitelist or KNOWN_FIELD_TAGS
+ self.whitelist = whitelist
+
+ def clearxml(self):
+
+ # # scan the datafields in the records and copy to the new one only the tags in the whitelist
+ # for record in root: # create new record
+ newRecord = ET.Element('record')
+ for field in self.reader:
+ fieldtag = field.attrib.get('tag')
+ if fieldtag in self.whitelist:
+ temptag = fieldtag
+ # tag 700 and 400 are the same
+ if temptag == '700':
+ temptag = '400'
+ for data in field:
+ newFieldTag = temptag
+ newFieldTag += '.'
+ newFieldTag += data.attrib.get('code')
+ newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag})
+ newTag.text = data.text
+
+ newRecordTree = ET.ElementTree(newRecord)
+ return ET.ElementTree(newRecord)
diff --git a/parsers/__init__.py b/parsers/__init__.py
index d32c917..07907f9 100644
--- a/parsers/__init__.py
+++ b/parsers/__init__.py
@@ -1,2 +1,2 @@
-
+
from .INL_xml_parser import INLXmlParser \ No newline at end of file
diff --git a/parsers/basic_parser.py b/parsers/basic_parser.py
index dae19cb..32c1b43 100644
--- a/parsers/basic_parser.py
+++ b/parsers/basic_parser.py
@@ -1,6 +1,6 @@
-class BasicParser(object):
- def __init__(self):
- pass
-
- def parse(self, data):
- raise NotImplementedError("parse() method must be implemented class {}".format(type(self)))
+class BasicParser(object):
+ def __init__(self):
+ pass
+
+ def parse(self, data):
+ raise NotImplementedError("parse() method must be implemented class {}".format(type(self)))
diff --git a/readers/xml_reader.py b/readers/xml_reader.py
index 5b2d1fd..710899d 100644
--- a/readers/xml_reader.py
+++ b/readers/xml_reader.py
@@ -1,61 +1,61 @@
-# from __future__ import absolute_import
-import json
-import csv
-import parsers, factories
-from entities import Person
-
-try:
- import xml.etree.cElementTree as ET
-except ImportError:
- import xml.etree.ElementTree as ET
-
-def read_file(path, element_key):
- # get an iterable
- record_counter = 0
- context = ET.iterparse(path, events=("start", "end"))
-
- # turn it into an iterator
- context = iter(context)
-
- # get the root element
- event, root = context.__next__()
-
- # the factory
- inl_factory = factories.INLFactory()
- files = {}
- for event, element in context:
- if 'end' in event:
- if element_key in element.tag:
- # enter the processing here
- record_counter += 1
-
- #cleaned element is a tree
- inl_parser = parsers.INLXmlParser(element)
- cleaned_element = inl_parser.clearxml()
- entity = inl_factory.get_entity(cleaned_element)
-
- # test print the entity
- if entity != None:
- if entity.TYPE not in files:
- files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8')
- json_entity = entity.to_json()
- print(json_entity)
- writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS)
- writer.writerow(entity.to_csv_dict())
- # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False)
- # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False)
-
- # entity.print_entity()
-
- # TODO analys and upload the entity
-
-
- # import pdb; pdb.set_trace()
- print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@',
- cleaned_element.getroot().text)
- element.clear()
- print(record_counter)
-
-
-if __name__ == '__main__':
- read_file(r"../../NLI-nnl10.xml", 'record')
+# from __future__ import absolute_import
+import json
+import csv
+import parsers, factories
+from entities import Person
+
+try:
+ import xml.etree.cElementTree as ET
+except ImportError:
+ import xml.etree.ElementTree as ET
+
+def read_file(path, element_key):
+ # get an iterable
+ record_counter = 0
+ context = ET.iterparse(path, events=("start", "end"))
+
+ # turn it into an iterator
+ context = iter(context)
+
+ # get the root element
+ event, root = context.__next__()
+
+ # the factory
+ inl_factory = factories.INLFactory()
+ files = {}
+ for event, element in context:
+ if 'end' in event:
+ if element_key in element.tag:
+ # enter the processing here
+ record_counter += 1
+
+ #cleaned element is a tree
+ inl_parser = parsers.INLXmlParser(element)
+ cleaned_element = inl_parser.clearxml()
+ entity = inl_factory.get_entity(cleaned_element)
+
+ # test print the entity
+ if entity != None:
+ if entity.TYPE not in files:
+ files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8')
+ json_entity = entity.to_json()
+ print(json_entity)
+ writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS)
+ writer.writerow(entity.to_csv_dict())
+ # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False)
+ # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False)
+
+ # entity.print_entity()
+
+ # TODO analys and upload the entity
+
+
+ # import pdb; pdb.set_trace()
+ print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@',
+ cleaned_element.getroot().text)
+ element.clear()
+ print(record_counter)
+
+
+if __name__ == '__main__':
+ read_file(r"../../NLI-nnl10.xml", 'record')
diff --git a/testers/factorytester.py b/testers/factorytester.py
index 88e660d..55ebe7c 100644
--- a/testers/factorytester.py
+++ b/testers/factorytester.py
@@ -1,21 +1,21 @@
-from __future__ import absolute_import
-import parsers
-import factories
-import xml.etree.cElementTree as ET
-
-xmlpath = r"C:\roy\NLI-nnl10 - 1MB.xml"
-whitelist = ['100', '374', '400', '151', '451', '550', '551', '678']
-
-
-xmltree = ET.parse(xmlpath)
-entities = list()
-inl_factory = factories.INLFactory()
-
-for record in xmltree.getroot():
- inl_parser = parsers.INLXmlParser(record, whitelist)
- clean_record = inl_parser.clearxml()
- entities.append(inl_factory.get_entity(clean_record))
-
-for entity in entities:
- print(entity)
-
+from __future__ import absolute_import
+import parsers
+import factories
+import xml.etree.cElementTree as ET
+
+xmlpath = r"C:\roy\NLI-nnl10 - 1MB.xml"
+whitelist = ['100', '374', '400', '151', '451', '550', '551', '678']
+
+
+xmltree = ET.parse(xmlpath)
+entities = list()
+inl_factory = factories.INLFactory()
+
+for record in xmltree.getroot():
+ inl_parser = parsers.INLXmlParser(record, whitelist)
+ clean_record = inl_parser.clearxml()
+ entities.append(inl_factory.get_entity(clean_record))
+
+for entity in entities:
+ print(entity)
+
diff --git a/writers/wd_writer.py b/writers/wd_writer.py
index b88833f..4a456e5 100644
--- a/writers/wd_writer.py
+++ b/writers/wd_writer.py
@@ -1,6 +1,7 @@
-import pywikibot
-from pywikibot import pagegenerators, WikidataBot
-
-class WDWriter(object):
- def __init__(self):
- pass \ No newline at end of file
+import pywikibot
+from pywikibot import pagegenerators, WikidataBot
+
+
+class WDWriter(object):
+ def __init__(self, source_path, reader, factory):
+ self.source_path = source_path