From cfed90c1096a92c4c9e622dfe2d55d892595b2ff Mon Sep 17 00:00:00 2001
From: Ido Ivri
Date: Sun, 30 Oct 2016 19:47:28 +0200
Subject: initial commit of work done in DataHack

---
 .gitignore                 | 138 ++++++++++++++++++++++++++++++++++++++++++
 .idea/misc.xml             |   4 ++
 .idea/modules.xml          |   8 +++
 .idea/parser.iml           |  12 ++++
 __init__.py                |   0
 clustering_678.py          | 100 +++++++++++++++++++++++++++++++
 entities/__init__.py       |   5 ++
 entities/basic_entity.py   |   5 ++
 entities/institution.py    |   7 +++
 entities/location.py       |  27 +++++++++
 entities/person.py         | 119 +++++++++++++++++++++++++++++++++++++
 entities/snaks.py          |  91 ++++++++++++++++++++++++++++
 factories/INL_factory.py   | 145 +++++++++++++++++++++++++++++++++++++++++++++
 factories/__init__.py      |   2 +
 factories/basic_factory.py |   3 +
 libs/__init__.py           |   1 +
 libs/json_tools.py         |  12 ++++
 parsers/INL_xml_parser.py  |  41 +++++++++++++
 parsers/__init__.py        |   2 +
 parsers/basic_parser.py    |   6 ++
 readers/__init__.py        |   0
 readers/xml_reader.py      |  58 ++++++++++++++++++
 testers/factorytester.py   |  19 ++++++
 user-config.py             |   0
 writers/__init__.py        |   0
 writers/wd_writer.py       |  26 ++++++++
 26 files changed, 831 insertions(+)
 create mode 100755 .gitignore
 create mode 100755 .idea/misc.xml
 create mode 100755 .idea/modules.xml
 create mode 100755 .idea/parser.iml
 create mode 100755 __init__.py
 create mode 100755 clustering_678.py
 create mode 100755 entities/__init__.py
 create mode 100755 entities/basic_entity.py
 create mode 100755 entities/institution.py
 create mode 100755 entities/location.py
 create mode 100755 entities/person.py
 create mode 100755 entities/snaks.py
 create mode 100755 factories/INL_factory.py
 create mode 100755 factories/__init__.py
 create mode 100755 factories/basic_factory.py
 create mode 100755 libs/__init__.py
 create mode 100755 libs/json_tools.py
 create mode 100755 parsers/INL_xml_parser.py
 create mode 100755 parsers/__init__.py
 create mode 100755 parsers/basic_parser.py
 create mode 100755 readers/__init__.py
 create mode 100755 readers/xml_reader.py
 create mode 100755 testers/factorytester.py
 create mode 100755 user-config.py
 create mode 100755 writers/__init__.py
 create mode 100755 writers/wd_writer.py

diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..7ebdd82
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,138 @@
+# Created by .ignore support plugin (hsz.mobi)
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff:
+.idea/workspace.xml
+.idea/tasks.xml
+.idea/dictionaries
+.idea/vcs.xml
+.idea/jsLibraryMappings.xml
+
+# Sensitive or high-churn files:
+.idea/dataSources.ids
+.idea/dataSources.xml
+.idea/dataSources.local.xml
+.idea/sqlDataSources.xml
+.idea/dynamic.xml
+.idea/uiDesigner.xml
+
+# Gradle:
+.idea/gradle.xml
+.idea/libraries
+
+# Mongo Explorer plugin:
+.idea/mongoSettings.xml
+
+## File-based project format:
+*.iws
+
+## Plugin-specific files:
+
+# IntelliJ
+/out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
+.out/*
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100755
index 0000000..f778c9e
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100755
index 0000000..405d108
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/parser.iml b/.idea/parser.iml
new file mode 100755
index 0000000..6f63a63
--- /dev/null
+++ b/.idea/parser.iml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/__init__.py b/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/clustering_678.py b/clustering_678.py
new file mode 100755
index 0000000..99391ac
--- /dev/null
+++ b/clustering_678.py
@@ -0,0 +1,100 @@
+# coding=utf-8
+import pandas as pd
+import unicodedata
+from sklearn.cluster import KMeans
+
+PERSON_001_PATH = r"C:\Users\Emanuel\Desktop\DataHack16\PERSON - 001.csv"
+FEATURES_TABLE_PATH = r"C:\Users\Emanuel\Desktop\DataHack16\features_678.csv"
+
+
+# We find all keys with an empty value:
+def return_keys_without_value(dic):
+    keys = []
+    for key, value in dic.items():
+        if value == "":  # todo: take care of keys without value that do contain : somewhere
+            keys.append(key)
+    if len(keys) > 0:
+        return keys
+    else:
+        return None
+
+
+data = pd.read_csv(PERSON_001_PATH, names=["string", "id"])
+data = data.dropna()
+data = data[data["string"] != "{}"]
+data["string"] = [eval(k) for k in data["string"]]  # rows hold dict literals; ast.literal_eval would be safer than eval
+data["string"] = [return_keys_without_value(dic) for dic in data["string"]]
+data = data.dropna()
+
+string_list = []
+id_list = []
+for _, row in data.iterrows():
+    for elem in row["string"]:
+        string_list.append(elem)
+        id_list.append(row["id"])
+new_data = pd.DataFrame({"string": string_list, "id": id_list})
+new_data.to_csv(r"C:\Users\Emanuel\Desktop\DataHack16\data_separated.csv", encoding="utf-8")
+
+
+def is_all_hebrew(s):
+    try:
+        s = s.decode("utf-8")
+    except AttributeError:
+        pass
+
+    # remove all non-letter characters:
+    q = ""
+    for i in s:
+        if i.isalpha():
+            q = "".join([q, i])
+
+    return all('HEBREW' in unicodedata.name(c) for c in q)
+
+
+def is_all_english(s):
+    try:
+        s = s.decode("utf-8")
+    except AttributeError:
+        pass
+
+    # remove all non-letter characters:
+    chars_only = ""
+    for i in s:
+        if i.isalpha():
+            chars_only = "".join([chars_only, i])
+    return all('LATIN' in unicodedata.name(c) for c in chars_only)
+
+
+def count_words(s):
+    return len(s.split())
+
+
+# todo: add a feature "contains_predefined_year_prefixes", like b. or d.
+# todo: add a feature that checks whether the string contains a number that is not a year (i.e. not in the range ...)
+# todo: detect hebrew years using quotes
+new_data["is_all_hebrew"] = new_data["string"].apply(is_all_hebrew)
+new_data["is_all_english"] = new_data["string"].apply(is_all_english)
+new_data["number_of_words"] = new_data["string"].apply(count_words)
+new_data["contains_quote"] = new_data["string"].apply(lambda s: '"' in s)
+new_data["contains_colon"] = new_data["string"].apply(lambda s: ':' in s)
+new_data.to_csv(FEATURES_TABLE_PATH)
+
+X = new_data.copy()
+assert isinstance(X, pd.DataFrame)
+del X["id"]
+del X["string"]
+print(X.columns)
+X = (X - X.mean()) / (X.max() - X.min())  # normalizing the features
+
+range_n_clusters = [4, 6]
+for n_clusters in range_n_clusters:
+    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
+    cluster_labels = clusterer.fit_predict(X)
+    centers = clusterer.cluster_centers_
+    print("\n %s clusters:" % n_clusters)
+    print("cluster labels: %s" % cluster_labels)
+    print("cluster centers: %s" % centers)
+
+    for k in range(n_clusters):
+        print("\ncluster %d consists of the following strings:" % k)
+        print(new_data["string"][cluster_labels == k])
diff --git a/entities/__init__.py b/entities/__init__.py
new file mode 100755
index 0000000..1398576
--- /dev/null
+++ b/entities/__init__.py
@@ -0,0 +1,5 @@
+from entities.person import Person
+from entities.institution import Institution
+from entities.location import Location
+
+from entities.snaks import EntityIdSnak, GeoSnak, MonoLingualStringSnak, SomeValueSnak, StringSnak, TimeSnak, UrlSnak
\ No newline at end of file
diff --git a/entities/basic_entity.py b/entities/basic_entity.py
new file mode 100755
index 0000000..9181422
--- /dev/null
+++ b/entities/basic_entity.py
@@ -0,0 +1,5 @@
+from libs import JsonSerializable
+
+
+class BasicEntity(JsonSerializable):
+    pass
diff --git a/entities/institution.py b/entities/institution.py
new file mode 100755
index 0000000..5fa7570
--- /dev/null
+++ b/entities/institution.py
@@ -0,0 +1,7 @@
+from entities.basic_entity import BasicEntity
+
+
+class Institution(BasicEntity):
+    def __init__(self, viaf=None):
+        # BasicEntity takes no constructor arguments, so there is nothing to pass to super().__init__()
+        raise NotImplementedError()
diff --git a/entities/location.py b/entities/location.py
new file mode 100755
index 0000000..cdec3a6
--- /dev/null
+++ b/entities/location.py
@@ -0,0 +1,27 @@
+import json
+
+from entities.basic_entity import BasicEntity
+
+
+class Location(BasicEntity):
+    def __init__(self, name, types_of_place, name_in_langs, comments_list, viaf):
+        self.name = name
+        self.types_of_place = types_of_place
+        self.name_in_langs = name_in_langs
+        self.comments_list = comments_list
+        self.viaf = viaf
+
+    # CSV_FIELDS = ["name", "comments"]
+    CSV_FIELDS = ["viaf", "name", "types_of_place", "name_in_langs", "comments_list"]
+    TYPE = "LOCATION"
+
+
+    def print_entity(self):
+        print("Name = " + self.name)
+        print("Name in langs = " + str(self.name_in_langs))
+        print("Types = " + str(self.types_of_place))
+        print("Comments = " + str(self.comments_list))
+
+    def to_csv_dict(self):
+        return {'name': self.name,
+                'comments': json.dumps(self.comments_list, ensure_ascii=False)}
diff --git a/entities/person.py b/entities/person.py
new file mode 100755
index 0000000..46ed315
--- /dev/null
+++ b/entities/person.py
@@ -0,0 +1,119 @@
+import json
+
+from entities.snaks import *
+from entities.basic_entity import BasicEntity
+
+
+class Person(BasicEntity):
+    def __init__(self, name, date_of_birth, date_of_death, name_in_langs, bio_data, comments_list, profession, viaf,
+                 national_lib_id):
+        """
+
+        :param name:
+        :param date_of_birth:
+        :param name_in_langs: Mapping of the person's name in various languages, as a dictionary. For example:
+        {
+            "latin": "George",
+            "heb": "[george in hebrew]"
+        }
+        """
+        self.name = name
+        dob = [date_of_birth]
+        dod = [date_of_death]
+        self.name_in_langs = name_in_langs
+        self.national_lib_id = national_lib_id
+
+        bio_data_dict = dict()
+        struct_bio_data = dict()
+        for elem in bio_data:
+            elem_splitted = elem.split(":")
+            if len(elem_splitted) == 2:
+                bio_data_key = elem_splitted[0].strip()
+                bio_data_value = elem_splitted[1].strip()
+
+                if bio_data_key.startswith(u"תאריך לידה"):  # keys were split on ':', so they carry no colon suffix
+                    dob.append(bio_data_value)
+                elif bio_data_key.startswith(u"תאריך פטירה"):
+                    dod.append(bio_data_value)
+                elif bio_data_key.startswith(u"מקצוע") or bio_data_key.startswith(u"מיקצוע"):
+                    profession.append(bio_data_value)
+                else:
+                    struct_bio_data[bio_data_key] = bio_data_value
+
+                if bio_data_key in bio_data_dict:
+                    bio_data_dict.get(bio_data_key).append(bio_data_value)
+                else:
+                    bio_data_dict.update(
+                        {bio_data_key: [bio_data_value]}
+                    )
+            else:
+                bio_data_dict.update({elem: ''})
+        self.bio_data = bio_data_dict
+        self.comments_list = comments_list
+        self.profession = profession
+        self.viaf = viaf
+        self.date_of_birth = dob
+        self.date_of_death = dod
+        self.struct_bio_data = struct_bio_data
+
+    # superseded draft field lists from the hackathon,
+    # kept for reference:
+    # CSV_FIELDS = ["name", "biodata", "comments", "viaf"]
+    # CSV_FIELDS = ["678 - biodata", "001 - national lib id"]
+    # CSV_FIELDS = ["viaf", "name", "biodata", "comments"]
+
+    CSV_FIELDS = ["viaf", "national_lib_id", "name", "date_of_birth", "date_of_death", "name_in_langs", "bio_data",
+                  "struct_bio_data", "comments_list", "profession"]
+    TYPE = 'PERSON'
+
+    def print_entity(self):
+        print("Name = " + self.name)
+        print("Birth year = " + str(self.date_of_birth))
+        print("Death year = " + str(self.date_of_death))
+        print("Names in langs = " + str(self.name_in_langs))
+        print("Bio Data = " + json.dumps(self.bio_data))
+        print("Comments = " + json.dumps(self.comments_list))
+        print("Profession = " + json.dumps(self.profession))
+
+    def to_csv_dict(self):
+        return {'viaf': self.viaf, 'name': self.name, 'biodata': self.bio_data,
+                'comments': json.dumps(self.comments_list, ensure_ascii=False)}
+
+    def to_wd_claims(self):
+        claims = []
+
+        if self.date_of_birth:
+            claims.append({
+                "type": "claim",
+                "mainsnak": TimeSnak(property='P569', date=self.date_of_birth[0]).to_json()
+            })
+        if self.date_of_death:
+            claims.append({
+                "type": "claim",
+                "mainsnak": TimeSnak(property='P570', date=self.date_of_death[0]).to_json()
+            })
+        if self.profession:
+            for elem in self.profession:
+                claims.append({
+                    "type": "claim",
+                    "mainsnak": StringSnak(property='P106', value=elem).to_json()
+                })
+        if self.viaf:
+            claims.append({
+                "type": "claim",
+                "mainsnak": StringSnak(property='P214', value=self.viaf).to_json()
+            })
+        if self.struct_bio_data:
+            for bio_key, bio_value in self.struct_bio_data.items():
+                if bio_key.startswith(u"מקום לידה"):
+                    claims.append({
+                        "type": "claim",
+                        "mainsnak": StringSnak(property='P19', value=bio_value).to_json()
+                    })
+                if bio_key.startswith(u"מקום פטירה"):
+                    claims.append({
+                        "type": "claim",
+                        "mainsnak": StringSnak(property='P20', value=bio_value).to_json()
+                    })
+
+        return claims
diff --git a/entities/snaks.py b/entities/snaks.py
new file mode 100755
index 0000000..8874329
--- /dev/null
+++ b/entities/snaks.py
@@ -0,0 +1,91 @@
+from datetime import datetime
+
+from libs import JsonSerializable
+
+
+class BasicSnak(JsonSerializable):
+    def __init__(self, snaktype, property, datatype, datavalue):
+        self.snaktype = snaktype
+        self.property = property
+        self.datatype = datatype
+        self.datavalue = datavalue
+
+
+class StringSnak(BasicSnak):
+    def __init__(self, property, value):
+        datavalue = {
+            "type": "string",
+            "value": value
+        }
+        super().__init__(snaktype="value", property=property, datatype="string", datavalue=datavalue)
+
+
+class MonoLingualStringSnak(BasicSnak):
+    def __init__(self, property, value, language):
+        datavalue = {
+            "type": "monolingualtext",
+            "value": {
+                "language": language,
+                "text": value
+            }
+        }
+        super().__init__(snaktype="value", property=property, datatype="monolingualtext", datavalue=datavalue)
+
+
+class EntityIdSnak(BasicSnak):
+    def __init__(self, property, entity_type, entity_id):
+        datavalue = {
+            "value": {
+                "entity-type": entity_type,
+                "numeric-id": entity_id
+            },
+            "type": "wikibase-item"
+        }
+        super().__init__(snaktype="value", property=property, datatype="wikibase-entityid", datavalue=datavalue)
+
+
+class UrlSnak(BasicSnak):
+    def __init__(self, property, url):
+        datavalue = {
+            "type": "string",
+            "value": url
+        }
+        super().__init__(snaktype="value", property=property, datatype="url", datavalue=datavalue)
+
+
+class TimeSnak(BasicSnak):
+    def __init__(self, property, date, precision=11):
+        if not isinstance(date, datetime):
+            date = datetime(int(date), 1, 1)  # assumes a bare year string such as "1887"; pass precision=9 for year-only dates
+        datavalue = {
+            "value": {
+                "time": date.isoformat(),
+                "timezone": 0,
+                "before": 0,
+                "after": 0,
+                "precision": precision,
+                "calendarmodel": "http://www.wikidata.org/entity/Q1985727"
+            },
+            "type": "time"
+        }
+        super().__init__(snaktype="value", property=property, datatype="time", datavalue=datavalue)
+
+
+class GeoSnak(BasicSnak):
+    def __init__(self, property, latitude, longitude, precision):
+        datavalue = {
+            "value": {
+                "latitude": latitude,
+                "longitude": longitude,
+                "altitude": None,
+                "precision": precision,
+                "globe": "http://www.wikidata.org/entity/Q2"
+            },
+            "type": "globecoordinate"
+        }
+        super().__init__(snaktype="value", property=property, datatype="globe-coordinate", datavalue=datavalue)
+
+
+class SomeValueSnak(BasicSnak):
+    def __init__(self, property):
+        super().__init__(snaktype="somevalue", property=property, datatype=None, datavalue=None)
diff --git a/factories/INL_factory.py b/factories/INL_factory.py
new file mode 100755
index 0000000..72b618c
--- /dev/null
+++ b/factories/INL_factory.py
@@ -0,0 +1,145 @@
+import entities
+from factories import BasicFactory
+import xml.etree.cElementTree as ET
+
+TAG_TO_ENTITY_MAPPING = {
+    '100': entities.Person,
+    '110': entities.Institution,
+    '151': entities.Location
+}
+
+
+ENTITY_KEYS = {
+    '100.a': 'name',
+    '100.9': 'name_langindic',
+    '046.f': 'date_of_birth',
+    '046.g': 'date_of_death',
+    '400.a': 'name_in_langs',
+    '400.9': 'langs_langindic',
+    '678.a': 'bio_data',
+    '151.a': 'name',
+    '151.9': 'name_langindic',
+    '451.a': 'name_in_langs',
+    '451.9': 'langs_langindic',
+    '550.a': 'type_of_place',
+    '667.a': 'comment',
+    '374.a': 'profession',
+    '901.a': 'viaf',
+    # control field 001 has no subfield code, so the parser emits a bare '001' tag
+    '001': 'national_lib_id',
+}
+
+
+def get_record_key(record):
+    root = record.getroot()
+    for field in root:
+        field_tag = field.attrib.get('tag')
+        if '100' in field_tag:
+            return '100'
+        if '151' in field_tag:
+            return '151'
+        if '110' in field_tag:
+            return '110'
+
+class INLFactory(BasicFactory):
+    def __init__(self, tag_to_entity_mapping=None):
+        self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING
+
+    def get_entity(self, raw_object, entity_keys=ENTITY_KEYS):
+        record_key = get_record_key(raw_object)
+        # tag 100 marks a person record
+        if record_key == '100':
+            name = ''
+            name_in_langs = dict()
+            bio_data = list()
+            comment_list = list()
+            eng_name = ''
+            profession = list()
+            name_diff = ''
+            date_of_birth = ''
+            date_of_death = ''
+            viaf = ''
+            national_lib_id = ''
+            # get the names, dates of birth and death, and bio data
+            for field in raw_object.getroot():
+                key = field.attrib.get('tag')
+                tag = entity_keys.get(key)
+                if tag == 'name':
+                    name = field.text
+                elif tag == 'name_langindic':
+                    # check if this is the English (Latin-script) name
+                    if field.text == 'lat':
+                        eng_name = name
+                    # else add it to name_in_langs
+                    else:
+                        if field.text in name_in_langs:
+                            name_in_langs.get(field.text).append(name)
+                        else:
+                            name_in_langs.update({field.text: [name]})
+                elif tag == 'date_of_birth':
+                    date_of_birth = field.text
+                elif tag == 'date_of_death':
+                    date_of_death = field.text
+                elif tag == 'name_in_langs':
+                    name_diff = field.text
+                elif tag == 'langs_langindic':
+                    if field.text in name_in_langs:
+                        name_in_langs.get(field.text).append(name_diff)
+                    else:
+                        name_in_langs.update({field.text: [name_diff]})
+                elif tag == 'bio_data':
+                    bio_data.append(field.text)
+                elif tag == 'comment':
+                    comment_list.append(field.text)
+                elif tag == 'profession':
+                    profession.append(field.text)
+                elif tag == 'viaf':
+                    viaf = field.text
+                elif tag == 'national_lib_id':
+                    national_lib_id = field.text
+            return entities.Person(eng_name, date_of_birth, date_of_death, name_in_langs, bio_data, comment_list, profession, viaf, national_lib_id)
+        # tag 110 marks an institution record
+        elif record_key == '110':
+            return entities.Institution()
+        # tag 151 marks a location record
+        elif record_key == '151':
+            name_in_langs = dict()
+            types_of_place = list()
+            comment_list = list()
+            eng_name = name = ''
+            name_diff = ''
+            viaf = ''
+            for field in raw_object.getroot():
+                key = field.attrib.get('tag')
+                tag = entity_keys.get(key)
+                if tag == 'name':
+                    name = field.text
+                elif tag == 'name_langindic':
+                    # check if this is the English (Latin-script) name
+                    if field.text == 'lat':
+                        eng_name = name
+                    # else add it to name_in_langs
+                    else:
+                        if field.text in name_in_langs:
+                            name_in_langs.get(field.text).append(name)
+                        else:
+                            name_in_langs.update({field.text: [name]})
+                elif tag == 'type_of_place':
+                    types_of_place.append(field.text)
+                elif tag == 'name_in_langs':
+                    name_diff = field.text
+                elif tag == 'langs_langindic':
+                    if field.text in name_in_langs:
+                        name_in_langs.get(field.text).append(name_diff)
+                    else:
+                        name_in_langs.update({field.text: [name_diff]})
+                elif tag == 'comment':
+                    comment_list.append(field.text)
+                elif tag == 'viaf':
+                    viaf = field.text
+            return entities.Location(eng_name, types_of_place, name_in_langs, comment_list, viaf)
+        else:
+            return None
+            # raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self)))
+
+
diff --git a/factories/__init__.py b/factories/__init__.py
new file mode 100755
index 0000000..86901f5
--- /dev/null
+++ b/factories/__init__.py
@@ -0,0 +1,2 @@
+from factories.basic_factory import BasicFactory
+from factories.INL_factory import INLFactory
\ No newline at end of file
diff --git a/factories/basic_factory.py b/factories/basic_factory.py
new file mode 100755
index 0000000..1715846
--- /dev/null
+++ b/factories/basic_factory.py
@@ -0,0 +1,3 @@
+class BasicFactory(object):
+    def get_entity(self, raw_object, entity_keys=None):
+        raise NotImplementedError("get_entity() method must be implemented by class {}".format(type(self)))
diff --git a/libs/__init__.py b/libs/__init__.py
new file mode 100755
index 0000000..c2514b7
--- /dev/null
+++ b/libs/__init__.py
@@ -0,0 +1 @@
+from libs.json_tools import JsonSerializable
\ No newline at end of file
diff --git a/libs/json_tools.py b/libs/json_tools.py
new file mode 100755
index 0000000..5c26b24
--- /dev/null
+++ b/libs/json_tools.py
@@ -0,0 +1,12 @@
+import json
+
+
+class JsonSerializable(object):
+    def __repr__(self):
+        return str(self.to_json())
+
+    def to_json(self):
+        return json.dumps(self.__dict__, ensure_ascii=False)
+
+    def to_dict(self):
+        return self.__dict__
\ No newline at end of file
diff --git a/parsers/INL_xml_parser.py b/parsers/INL_xml_parser.py
new file mode 100755
index 0000000..f90e778
--- /dev/null
+++ b/parsers/INL_xml_parser.py
@@ -0,0 +1,41 @@
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+KNOWN_FIELD_TAGS = ['100', '110', '151']
+
+TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374', '046', '901', '001']
+
+
+class INLXmlParser:
+    def __init__(self, reader, whitelist=TAG_WHITELIST):
+        self.reader = reader
+        # self.whitelist = whitelist or KNOWN_FIELD_TAGS
+        self.whitelist = whitelist
+
+    def clearxml(self):
+
+        # scan the datafields in the record and copy into a new record
+        # only the fields whose tag appears in the whitelist
+        newRecord = ET.Element('record')
+        for field in self.reader:
+            fieldtag = field.attrib.get('tag')
+            if fieldtag in self.whitelist:
+                temptag = fieldtag
+                if fieldtag == '001':
+                    newTag = ET.SubElement(newRecord, 'datafield', {'tag': '001'})
+                    newTag.text = field.text
+                else:
+                    # tags 700 and 400 are treated as the same field
+                    if temptag == '700':
+                        temptag = '400'
+                    for data in field:
+                        newFieldTag = temptag
+                        newFieldTag += '.'
+                        newFieldTag += data.attrib.get('code')
+                        newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag})
+                        newTag.text = data.text
+
+        newRecordTree = ET.ElementTree(newRecord)
+        return newRecordTree
diff --git a/parsers/__init__.py b/parsers/__init__.py
new file mode 100755
index 0000000..d32c917
--- /dev/null
+++ b/parsers/__init__.py
@@ -0,0 +1,2 @@
+
+from .INL_xml_parser import INLXmlParser
\ No newline at end of file
diff --git a/parsers/basic_parser.py b/parsers/basic_parser.py
new file mode 100755
index 0000000..dae19cb
--- /dev/null
+++ b/parsers/basic_parser.py
@@ -0,0 +1,6 @@
+class BasicParser(object):
+    def __init__(self):
+        pass
+
+    def parse(self, data):
+        raise NotImplementedError("parse() method must be implemented by class {}".format(type(self)))
diff --git a/readers/__init__.py b/readers/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/readers/xml_reader.py b/readers/xml_reader.py
new file mode 100755
index 0000000..4d8374e
--- /dev/null
+++ b/readers/xml_reader.py
@@ -0,0 +1,58 @@
+from __future__ import absolute_import
+import json
+import csv
+import parsers
+import factories
+from entities import Person
+from writers.wd_writer import get_entity_by_viaf
+
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+
+def read_file(path, element_key):
+    # get an iterable
+    record_counter = 0
+    context = ET.iterparse(path, events=("start", "end"))
+
+    # turn it into an iterator
+    context = iter(context)
+
+    # get the root element
+    event, root = next(context)
+
+    # the factory
+    inl_factory = factories.INLFactory()
+    files = {}
+    for event, element in context:
+        if event == 'end':
+            if element_key in element.tag:
+                # enter the processing here
+                record_counter += 1
+
+                # cleaned element is a tree
+                inl_parser = parsers.INLXmlParser(element)
+                cleaned_element = inl_parser.clearxml()
+                entity = inl_factory.get_entity(cleaned_element)
+
+                # test print the entity
+                if entity is not None:
+                    if entity.TYPE not in files:
+                        files[entity.TYPE] = open("out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8')
+                    json_entity = entity.to_json()
+                    print(json_entity)
+                    # writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS)
+                    # writer.writerow(entity.to_dict())
+
+                    if entity.viaf:
+                        print(get_entity_by_viaf(entity.viaf))
+
+                # TODO analyse and upload the entity
+                element.clear()
+    print(record_counter)
+
+
+if __name__ == '__main__':
+    read_file(r"../../NLI-nnl10.xml", 'record')
diff --git a/testers/factorytester.py b/testers/factorytester.py
new file mode 100755
index 0000000..b6029ca
--- /dev/null
+++ b/testers/factorytester.py
@@ -0,0 +1,19 @@
+from __future__ import absolute_import
+import parsers
+import factories
+import xml.etree.cElementTree as ET
+
+xmlpath = 'C:/Users/Ilsar/Documents/datahack/xml_example.xml'
+
+xmltree = ET.parse(xmlpath)
+entities = list()
+inl_factory = factories.INLFactory()
+
+for record in xmltree.getroot():
+    inl_parser = parsers.INLXmlParser(record)
+    clean_record = inl_parser.clearxml()
+    entities.append(inl_factory.get_entity(clean_record))
+
+for entity in entities:
+    entity.print_entity()
+
diff --git a/user-config.py b/user-config.py
new file mode 100755
index 0000000..e69de29
diff --git a/writers/__init__.py b/writers/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/writers/wd_writer.py b/writers/wd_writer.py
new file mode 100755
index 0000000..62ab11c
--- /dev/null
+++ b/writers/wd_writer.py
@@ -0,0 +1,26 @@
+import pywikibot
+from pywikibot import pagegenerators
+
+repo = pywikibot.Site().data_repository()
+
+
+def write_to_wd(entity):
+    # stub: look up an existing Wikidata item by VIAF before writing claims
+    if entity.viaf:
+        return get_entity_by_viaf(entity.viaf)
+
+
+# Finds the matching Wikidata item by its VIAF identifier (P214)
+def get_entity_by_viaf(viaf):
+    sparql = "SELECT ?item WHERE {{ ?item wdt:P214 ?VIAF filter(?VIAF = '{}') }}".format(viaf)
+
+    # WikidataSPARQLPageGenerator takes SPARQL; the legacy WikidataQueryPageGenerator expected WDQ syntax
+    entities = pagegenerators.WikidataSPARQLPageGenerator(sparql)
+    entities = list(entities)
+    if len(entities) == 0:
+        print("No entity found for VIAF: {}".format(viaf))
+        return None
+    elif len(entities) > 1:
+        # TODO: can distinct Wikidata items share a VIAF id? assumed unique for now
+        raise Exception('VIAF is expected to be unique')
+    return entities[0]
\ No newline at end of file
--
cgit v1.2.3
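
Appendix (not part of the patch): a minimal usage sketch of the pipeline this commit introduces — a MARC-XML authority record fed through INLXmlParser into INLFactory, then serialized to Wikidata-style claims. It mirrors testers/factorytester.py; the input path is a placeholder, and records of type 110 (Institution) would raise NotImplementedError since Institution is still a stub.

import xml.etree.ElementTree as ET

import factories
import parsers

# placeholder path: any NLI authorities MARC-XML export whose root holds <record> children
xmltree = ET.parse('xml_example.xml')
inl_factory = factories.INLFactory()

for record in xmltree.getroot():
    # copy only whitelisted MARC fields into a fresh record tree
    clean_record = parsers.INLXmlParser(record).clearxml()
    entity = inl_factory.get_entity(clean_record)  # Person, Location, or None
    if entity is not None and entity.TYPE == 'PERSON':
        print(entity.to_json())
        print(entity.to_wd_claims())  # claim dicts for P569/P570/P106/P214/P19/P20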