@@ -0,0 +1,100 @@
new file mode 100755
index 0000000..99391ac
--- /dev/null
+++ b/
@@ -0,0 +1,100 @@
+# coding=utf-8
+import pandas as pd
+import unicodedata
+from sklearn.cluster import KMeans
+PERSON_001_PATH = r"C:\Users\Emanuel\Desktop\DataHack16\PERSON - 001.csv"
+FEATURES_TABLE_PATH = r"C:\Users\Emanuel\Desktop\DataHack16\features_678.csv"
+# We find all keys with an empty value:
def return_keys_without_value(dic):
    """Return the keys of ``dic`` whose value is the empty string.

    :param dic: mapping to inspect.
    :return: list of the matching keys in iteration order, or ``None`` when
        no key maps to ``""``.
    """
    # todo: take care of keys without value that do contain : somewhere
    keys = [key for key, value in dic.items() if value == ""]
    # An empty result is reported as None rather than [].
    return keys or None
# Load the raw (string, id) pairs and drop rows that are missing or hold "{}".
data = pd.read_csv(PERSON_001_PATH, names=["string", "id"])
data = data.dropna()
data = data[data["string"] != "{}"]
# NOTE(review): eval() executes whatever expression is stored in the CSV cell.
# If every cell is a plain dict literal, ast.literal_eval would be the safe
# replacement -- confirm against the data before switching.
data["string"] = [eval(k) for k in data["string"]]
data["string"] = [return_keys_without_value(dic) for dic in data["string"]]
# Rows whose dict had no empty-valued key became None above; discard them.
data = data.dropna()

# Flatten: one output row per empty-valued key, paired with the id of its row.
string_list = []
id_list = []
for keys, row_id in zip(data["string"], data["id"]):
    for elem in keys:
        string_list.append(elem)
        id_list.append(row_id)

new_data = pd.DataFrame({"string": string_list, "id": id_list})
new_data.to_csv(r"C:\Users\Emanuel\Desktop\DataHack16\data_separated.csv", encoding="utf-8")
def is_all_hebrew(s):
    """Tell whether every alphabetic character of ``s`` is a Hebrew character.

    Non-alphabetic characters (digits, spaces, punctuation) are ignored, so a
    string without any letters yields ``True``.

    :param s: text string, or UTF-8 encoded bytes.
    :return: ``True`` if the Unicode name of each letter contains ``HEBREW``.
    """
    try:
        s = s.decode("utf-8")
    except AttributeError:
        # Already a text string: nothing to decode.
        pass
    letters = [c for c in s if c.isalpha()]
    # The default '' makes a character without a Unicode name count as
    # "not Hebrew" instead of raising ValueError.
    return all('HEBREW' in unicodedata.name(c, '') for c in letters)
def is_all_english(s):
    """Tell whether every alphabetic character of ``s`` is a Latin character.

    Non-alphabetic characters (digits, spaces, punctuation) are ignored, so a
    string without any letters yields ``True``.

    :param s: text string, or UTF-8 encoded bytes.
    :return: ``True`` if the Unicode name of each letter contains ``LATIN``.
    """
    try:
        s = s.decode("utf-8")
    except AttributeError:
        # Already a text string: nothing to decode.
        pass
    chars_only = [c for c in s if c.isalpha()]
    # The default '' makes a character without a Unicode name count as
    # "not Latin" instead of raising ValueError.
    return all('LATIN' in unicodedata.name(c, '') for c in chars_only)
def count_words(s):
    """Return the number of whitespace-separated tokens in ``s``."""
    return len(s.split())
+# todo: add a feature "contains_predefined_year_prefixes", like b. or d.
+# todo: add a feature that checks whether the string contains a number that is not a year (i.e not in the range ...)
+# todo: detect hebrew years using quotes
# Derive one simple feature per column from the raw string.
new_data["is_all_hebrew"] = new_data["string"].apply(is_all_hebrew)
new_data["is_all_english"] = new_data["string"].apply(is_all_english)
new_data["number_of_words"] = new_data["string"].apply(count_words)
new_data["contains_quote"] = new_data["string"].apply(lambda s: '"' in s)
new_data["contains_colon"] = new_data["string"].apply(lambda s: ':' in s)

# Feature matrix: everything except the identifier and the raw text.
# Cast to float first so that the boolean columns support subtraction.
X = new_data.drop(columns=["id", "string"]).astype(float)
# Mean-centre and scale by the value range. A constant column has range 0;
# use 1 there so the column becomes all zeros instead of NaN, which KMeans
# cannot handle.
value_range = (X.max() - X.min()).replace(0, 1)
X = (X - X.mean()) / value_range  # normalizing the features

range_n_clusters = [4, 6]
for n_clusters in range_n_clusters:
    # Fixed random_state keeps the clustering reproducible between runs.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    centers = clusterer.cluster_centers_
    print("\n %s clusters:" % n_clusters)
    print("cluster labels: %s" % cluster_labels)
    print("cluster centers: %s " % centers)
    for k in range(n_clusters):
        print("\ncluster %d consists of the following strings:" % k)
        print(new_data["string"][cluster_labels == k])
@@ -0,0 +1,5 @@
new file mode 100755
index 0000000..1398576
--- /dev/null
+++ b/entities/
@@ -0,0 +1,5 @@
+from entities.person import Person
+from entities.institution import Institution
+from entities.location import Location
+from entities.snaks import EntityIdSnak, GeoSnak, MonoLingualStringSnak, SomeValueSnak, StringSnak, TimeSnak, UrlSnak \ No newline at end of file
@@ -0,0 +1,5 @@
new file mode 100755
index 0000000..9181422
--- /dev/null
+++ b/entities/
@@ -0,0 +1,5 @@
+from libs import JsonSerializable
class BasicEntity(JsonSerializable):
    """Common base class of the entity types; adds nothing to JsonSerializable."""
    pass
@@ -0,0 +1,7 @@
new file mode 100755
index 0000000..5fa7570
--- /dev/null
+++ b/entities/
@@ -0,0 +1,7 @@
+from entities.basic_entity import BasicEntity
class Institution(BasicEntity):
    """Placeholder for institution records; construction is not supported yet."""

    def __init__(self, viaf=None):
        """Always raise ``NotImplementedError``.

        :param viaf: accepted for interface compatibility; currently unused.
        :raises NotImplementedError: on every call.
        """
        # The base class defines no __init__ taking arguments, so forwarding
        # ``viaf`` to super().__init__ raised TypeError before the intended
        # NotImplementedError could be reached.
        raise NotImplementedError()
@@ -0,0 +1,27 @@
new file mode 100755
index 0000000..cdec3a6
--- /dev/null
+++ b/entities/
@@ -0,0 +1,27 @@
+import json
+from entities.basic_entity import BasicEntity
class Location(BasicEntity):
    """A place record: its name, place types, names per language and comments."""

    # CSV_FIELDS = ["name", "comments"]
    CSV_FIELDS = ["viaf", "name", "types_of_place", "name_in_langs", "comments_list"]

    def __init__(self, name, types_of_place, name_in_langs, comments_list, viaf):
        """Store the given values as attributes of the same name.

        :param name: name of the place.
        :param types_of_place: place types of the record.
        :param name_in_langs: names of the place keyed by language indicator.
        :param comments_list: free-text comments.
        :param viaf: VIAF identifier.
        """
        self.name = name
        self.types_of_place = types_of_place
        self.name_in_langs = name_in_langs
        self.comments_list = comments_list
        self.viaf = viaf

    def print_entity(self):
        """Print the name, names per language, types and comments."""
        print("Name = " + self.name)
        print("Name in langs = " + str(self.name_in_langs))
        print("Types = " + str(self.types_of_place))
        print("Comments = " + str(self.comments_list))

    def to_csv_dict(self):
        """Return the name and the JSON-encoded comments as a dict.

        NOTE(review): the keys returned here ('name', 'comments') do not match
        CSV_FIELDS; confirm which set a CSV writer is expected to use.
        """
        return {'name': self.name,
                'comments': json.dumps(self.comments_list, ensure_ascii=False)}
@@ -0,0 +1,119 @@
new file mode 100755
index 0000000..46ed315
--- /dev/null
+++ b/entities/
@@ -0,0 +1,119 @@
+import json
+from entities.snaks import *
+from entities.basic_entity import BasicEntity
class Person(BasicEntity):
    """A person record with dates, names, free-form bio data and identifiers."""

    # CSV_FIELDS = ["name", "biodata", "comments", "viaf"]
    # CSV_FIELDS = ["678 - biodata", "001 - national lib id"]
    # CSV_FIELDS = ["viaf", "name", "biodata", "comments"]
    CSV_FIELDS = ["viaf", "national_lib_id", "name", "date_of_birth", "date_of_death", "name_in_langs", "bio_data",
                  "struct_bio_data", "comments_list", "profession"]

    def __init__(self, name, date_of_birth, date_of_death, name_in_langs, bio_data, comments_list, profession, viaf,
                 national_lib_id):
        """
        :param name: the person's name.
        :param date_of_birth: date of birth taken from the record; stored as
            the first element of the ``date_of_birth`` list.
        :param date_of_death: date of death taken from the record; stored as
            the first element of the ``date_of_death`` list.
        :param name_in_langs: Mapping of the persons's name in various languages, as a dictionary. For example:
            {
                "latin": "George"
                "heb": "[george in hebrew]"
            }
        :param bio_data: iterable of bio strings. A string of the form
            ``key: value`` is parsed; any other string is kept as a key with
            an empty value.
        :param comments_list: free-text comments.
        :param profession: professions; extended with professions found in
            ``bio_data``.
        :param viaf: VIAF identifier.
        :param national_lib_id: national library identifier.
        """
        self.name = name
        dob = [date_of_birth]
        dod = [date_of_death]
        # Work on a copy so the caller's list is not modified.
        profession = list(profession) if profession else []
        self.name_in_langs = name_in_langs
        self.national_lib_id = national_lib_id

        bio_data_dict = dict()
        struct_bio_data = dict()
        for elem in bio_data:
            if not elem:
                # Empty or missing text carries no information.
                continue
            elem_splitted = elem.split(":")
            if len(elem_splitted) != 2:
                # Not a single "key: value" pair: keep the raw text as a key.
                bio_data_dict[elem] = ''
                continue
            bio_data_key = elem_splitted[0].strip()
            bio_data_value = elem_splitted[1].strip()
            # The key was cut at the colon, so the prefixes are compared
            # without the trailing ": " (with it they could never match).
            if bio_data_key.startswith(u"תאריך לידה"):
                dob.append(bio_data_value)
            elif bio_data_key.startswith(u"תאריך פטירה"):
                dod.append(bio_data_value)
            elif bio_data_key.startswith((u"מקצוע", u"מיקצוע")):
                profession.append(bio_data_value)
            else:
                struct_bio_data[bio_data_key] = bio_data_value
            # Every key/value pair is also kept in the raw bio data, grouped
            # by key.
            bio_data_dict.setdefault(bio_data_key, []).append(bio_data_value)

        self.bio_data = bio_data_dict
        self.comments_list = comments_list
        self.profession = profession
        self.viaf = viaf
        self.date_of_birth = dob
        self.date_of_death = dod
        self.struct_bio_data = struct_bio_data

    def print_entity(self):
        """Print the main attributes of the person."""
        print("Name = " + self.name)
        # The dates are lists, so they are converted explicitly.
        print("Birth year = " + str(self.date_of_birth))
        print("Death year = " + str(self.date_of_death))
        print("Names in langs = " + str(self.name_in_langs))
        print("Bio Data = " + json.dumps(self.bio_data))
        print("Comments = " + json.dumps(self.comments_list))
        print("Profession = " + json.dumps(self.profession))

    def to_csv_dict(self):
        """Return viaf, name, bio data and the JSON-encoded comments as a dict."""
        return {'viaf': self.viaf, 'name': self.name, 'biodata': self.bio_data,
                'comments': json.dumps(self.comments_list, ensure_ascii=False)}

    @staticmethod
    def _claim(snak):
        """Wrap a snak in the claim structure used by ``to_wd_claims``."""
        return {"type": "claim", "mainsnak": snak.to_json()}

    @staticmethod
    def _first_value(values):
        """Return the first non-empty item of ``values``, or ``None``."""
        return next((value for value in values if value), None)

    def to_wd_claims(self):
        """Build the list of claims describing this person.

        :return: list of ``{"type": "claim", "mainsnak": ...}`` dicts for the
            birth date (P569), death date (P570), professions (P106), VIAF id
            (P214), place of birth (P19) and place of death (P20), each only
            when a value is available.
        """
        claims = []
        # The date lists always hold the record value first, possibly as an
        # empty string; only a non-empty value can become a claim.
        birth = self._first_value(self.date_of_birth)
        if birth:
            claims.append(self._claim(TimeSnak(property='P569', date=birth)))
        death = self._first_value(self.date_of_death)
        if death:
            claims.append(self._claim(TimeSnak(property='P570', date=death)))
        for elem in self.profession or []:
            claims.append(self._claim(StringSnak(property='P106', value=elem)))
        if self.viaf:
            claims.append(self._claim(StringSnak(property='P214', value=self.viaf)))
        for bio_key, bio_value in (self.struct_bio_data or {}).items():
            if bio_key.startswith(u"מקום לידה"):
                claims.append(self._claim(StringSnak(property='P19', value=bio_value)))
            if bio_key.startswith(u"מקום פטירה"):
                claims.append(self._claim(StringSnak(property='P20', value=bio_value)))
        return claims
@@ -0,0 +1,91 @@
new file mode 100755
index 0000000..8874329
--- /dev/null
+++ b/entities/
@@ -0,0 +1,91 @@
+from datetime import datetime
+from libs import JsonSerializable
class BasicSnak(JsonSerializable):
    """Base snak: holds the four fields shared by all snak kinds."""

    def __init__(self, snaktype, property, datatype, datavalue):
        """
        :param snaktype: kind of snak, e.g. ``"value"`` or ``"somevalue"``.
        :param property: property identifier the snak belongs to.
        :param datatype: data type name of the value, or ``None``.
        :param datavalue: the value structure, or ``None``.
        """
        self.snaktype = snaktype
        self.property = property
        self.datatype = datatype
        self.datavalue = datavalue
class StringSnak(BasicSnak):
    """Value snak carrying a plain string."""

    def __init__(self, property, value):
        """
        :param property: property identifier.
        :param value: the string value.
        """
        datavalue = {
            "type": "string",
            "value": value,
        }
        super().__init__(snaktype="value", property=property, datatype="string", datavalue=datavalue)
class MonoLingualStringSnak(BasicSnak):
    """Value snak carrying a text together with its language."""

    def __init__(self, property, value, language):
        """
        :param property: property identifier.
        :param value: the text.
        :param language: language code of the text.
        """
        datavalue = {
            "type": "monolingualtext",
            "value": {
                "language": language,
                "text": value,
            },
        }
        super().__init__(snaktype="value", property=property, datatype="monolingualtext", datavalue=datavalue)
class EntityIdSnak(BasicSnak):
    """Value snak referencing another entity by its numeric id."""

    def __init__(self, property, entity_type, entity_id):
        """
        :param property: property identifier.
        :param entity_type: type of the referenced entity, e.g. ``"item"``.
        :param entity_id: numeric id of the referenced entity.
        """
        # In the Wikibase JSON model the datavalue type is
        # "wikibase-entityid" and the snak datatype is "wikibase-item";
        # the two were swapped here.
        datavalue = {
            "value": {
                "entity-type": entity_type,
                "numeric-id": entity_id,
            },
            "type": "wikibase-entityid",
        }
        super().__init__(snaktype="value", property=property, datatype="wikibase-item", datavalue=datavalue)
class UrlSnak(BasicSnak):
    """Value snak carrying a URL, stored as a string value of datatype ``url``."""

    def __init__(self, property, url):
        """
        :param property: property identifier.
        :param url: the URL.
        """
        datavalue = {
            "type": "string",
            "value": url,
        }
        super().__init__(snaktype="value", property=property, datatype="url", datavalue=datavalue)
class TimeSnak(BasicSnak):
    """Value snak carrying a point in time."""

    # Textual layouts accepted for ``date``, tried in this order.
    _DATE_FORMATS = ("%Y-%m-%d", "%Y%m%d", "%Y-%m", "%Y")

    def __init__(self, property, date, precision=11):
        """
        :param property: property identifier.
        :param date: a ``datetime``, or a string in one of ``_DATE_FORMATS``.
        :param precision: precision code stored with the value (11 = day in
            the Wikibase model). It is not adjusted when ``date`` only holds
            a year or a month.
        :raises ValueError: if ``date`` is a string in no supported layout.
        """
        if not isinstance(date, datetime):
            # datetime(<str>) raises TypeError; parse the text instead.
            date = self._parse_date(date)
        datavalue = {
            "value": {
                # Wikibase time strings carry a sign and a trailing "Z";
                # formatting by hand keeps the year zero-padded everywhere.
                "time": "+{:04d}-{:02d}-{:02d}T00:00:00Z".format(date.year, date.month, date.day),
                "timezone": 0,
                "before": 0,
                "after": 0,
                "precision": precision,
                "calendarmodel": "http://www.wikidata.org/entity/Q1985727",
            },
            "type": "time",
        }
        super().__init__(snaktype="value", property=property, datatype="time", datavalue=datavalue)

    @classmethod
    def _parse_date(cls, raw):
        """Convert ``raw`` to a ``datetime`` using the first matching layout."""
        text = str(raw).strip()
        for fmt in cls._DATE_FORMATS:
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
        raise ValueError("Unsupported date value: {!r}".format(raw))
class GeoSnak(BasicSnak):
    """Value snak carrying a globe coordinate."""

    def __init__(self, latitude, longitude, precision, property='P625'):
        """
        :param latitude: latitude of the point.
        :param longitude: longitude of the point.
        :param precision: precision stored with the coordinate.
        :param property: property identifier. Previously the name resolved to
            the ``property`` builtin because no such parameter existed; the
            default is Wikidata's "coordinate location" property.
        """
        datavalue = {
            "value": {
                "latitude": latitude,
                "longitude": longitude,
                "altitude": None,
                "precision": precision,
                "globe": "http://www.wikidata.org/entity/Q2",
            },
            "type": "globecoordinate",
        }
        super().__init__(snaktype="value", property=property, datatype="globe-coordinate", datavalue=datavalue)
class SomeValueSnak(BasicSnak):
    """Snak of type ``somevalue``: it has a property but no datatype or value."""

    def __init__(self, property):
        """
        :param property: property identifier.
        """
        super().__init__(snaktype="somevalue", property=property, datatype=None, datavalue=None)
@@ -0,0 +1,145 @@
new file mode 100755
index 0000000..72b618c
--- /dev/null
+++ b/factories/
@@ -0,0 +1,145 @@
+import entities
+from factories import BasicFactory
+import xml.etree.cElementTree as ET
# Maps the record key returned by get_record_key() to the entity class
# that represents such a record.
TAG_TO_ENTITY_MAPPING = {
    '100': entities.Person,
    '110': entities.Institution,
    '151': entities.Location,
}
# Maps a "<tag>.<code>" field tag of a cleaned record to the name of the
# entity attribute it feeds.
ENTITY_KEYS = {
    '100.a': 'name',
    '100.9': 'name_langindic',
    '046.f': 'date_of_birth',
    '046.g': 'date_of_death',
    '400.a': 'name_in_langs',
    '400.9': 'langs_langindic',
    '678.a': 'bio_data',
    '151.a': 'name',
    '151.9': 'name_langindic',
    # Tag and code are joined with a dot like every other entry; the
    # colon-separated form could never match a field tag.
    '451.a': 'name_in_langs',
    '451.9': 'langs_langindic',
    '550.a': 'type_of_place',
    '667.a': 'comment',
    '374.a': 'profession',
    '901.a': 'viaf',
    '001.': 'national_lib_id',
    '001': 'national_lib_id',
}
def get_record_key(record):
    """Return the key identifying the kind of ``record``.

    The fields of the record are scanned in order; the first one whose tag
    starts with '100', '151' or '110' decides the result.

    :param record: an ElementTree whose root holds the record's fields.
    :return: ``'100'``, ``'151'`` or ``'110'``, or ``None`` when no field
        carries one of these tags.
    """
    for field in record.getroot():
        # A field without a tag attribute cannot identify the record.
        field_tag = field.attrib.get('tag') or ''
        for record_key in ('100', '151', '110'):
            if field_tag.startswith(record_key):
                return record_key
    return None
class INLFactory(BasicFactory):
    """Builds entity objects out of cleaned records."""

    def __init__(self, tag_to_entity_mapping=None):
        """
        :param tag_to_entity_mapping: optional replacement for
            TAG_TO_ENTITY_MAPPING; stored in ``self.mapping``.
        """
        self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING

    def get_entity(self, raw_object, entity_keys=ENTITY_KEYS):
        """Create the entity described by ``raw_object``.

        :param raw_object: ElementTree of a cleaned record.
        :param entity_keys: mapping from field tag to attribute name.
        :return: a Person for a '100' record, a Location for a '151' record,
            ``None`` for an unrecognised record.
        :raises NotImplementedError: presumably for '110' records, as the
            Institution entity is not implemented -- verify.
        """
        record_key = get_record_key(raw_object)
        # (attribute name, text) of every field, in document order. Fields
        # with an unknown tag get the name None and are ignored below.
        fields = [(entity_keys.get(field.attrib.get('tag')), field.text)
                  for field in raw_object.getroot()]
        # 100 is person
        if record_key == '100':
            return self._build_person(fields)
        # 110 is institute
        if record_key == '110':
            return entities.Institution()
        # 151 is location
        if record_key == '151':
            return self._build_location(fields)
        return None
        # raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self)))

    @staticmethod
    def _build_person(fields):
        """Collect the person attributes from ``fields`` and build a Person."""
        name = ''
        name_diff = ''
        eng_name = ''
        date_of_birth = ''
        date_of_death = ''
        viaf = ''
        national_lib_id = ''
        name_in_langs = dict()
        bio_data = list()
        comment_list = list()
        profession = list()
        for tag, text in fields:
            if tag == 'name':
                name = text
            elif tag == 'name_langindic':
                # 'lat' marks the English name; any other indicator files the
                # last seen name under that language.
                if text == 'lat':
                    eng_name = name
                else:
                    name_in_langs.setdefault(text, []).append(name)
            elif tag == 'date_of_birth':
                date_of_birth = text
            elif tag == 'date_of_death':
                date_of_death = text
            elif tag == 'name_in_langs':
                name_diff = text
            elif tag == 'langs_langindic':
                name_in_langs.setdefault(text, []).append(name_diff)
            elif tag == 'bio_data':
                bio_data.append(text)
            elif tag == 'comment':
                comment_list.append(text)
            elif tag == 'profession':
                profession.append(text)
            elif tag == 'viaf':
                viaf = text
            elif tag == 'national_lib_id':
                national_lib_id = text
        return entities.Person(eng_name, date_of_birth, date_of_death, name_in_langs, bio_data, comment_list,
                               profession, viaf, national_lib_id)

    @staticmethod
    def _build_location(fields):
        """Collect the location attributes from ``fields`` and build a Location."""
        # ``name`` starts empty so that a language indicator arriving before
        # any name field no longer hits an unbound variable.
        name = ''
        name_diff = ''
        eng_name = ''
        viaf = ''
        name_in_langs = dict()
        types_of_place = list()
        comment_list = list()
        for tag, text in fields:
            if tag == 'name':
                name = text
            elif tag == 'name_langindic':
                # 'lat' marks the English name; any other indicator files the
                # last seen name under that language.
                if text == 'lat':
                    eng_name = name
                else:
                    name_in_langs.setdefault(text, []).append(name)
            elif tag == 'type_of_place':
                types_of_place.append(text)
            elif tag == 'name_in_langs':
                name_diff = text
            elif tag == 'langs_langindic':
                name_in_langs.setdefault(text, []).append(name_diff)
            elif tag == 'comment':
                comment_list.append(text)
            elif tag == 'viaf':
                viaf = text
        return entities.Location(eng_name, types_of_place, name_in_langs, comment_list, viaf)
@@ -0,0 +1,2 @@
new file mode 100755
index 0000000..86901f5
--- /dev/null
+++ b/factories/
@@ -0,0 +1,2 @@
+from factories.basic_factory import BasicFactory
+from factories.INL_factory import INLFactory \ No newline at end of file
@@ -0,0 +1,3 @@
new file mode 100755
index 0000000..1715846
--- /dev/null
+++ b/factories/
@@ -0,0 +1,3 @@
class BasicFactory(object):
    """Interface of the entity factories."""

    def get_entity(self, entity_key, raw_object):
        """Build an entity; must be overridden by subclasses.

        :raises NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError("get_entity() method must be implemented class {}".format(type(self)))
@@ -0,0 +1 @@
new file mode 100755
index 0000000..c2514b7
--- /dev/null
+++ b/libs/
@@ -0,0 +1 @@
+from libs.json_tools import JsonSerializable \ No newline at end of file
@@ -0,0 +1,12 @@
new file mode 100755
index 0000000..5c26b24
--- /dev/null
+++ b/libs/
@@ -0,0 +1,12 @@
+import json
class JsonSerializable(object):
    """Mixin exposing an object's attributes as a dict or a JSON string."""

    def __repr__(self):
        """Represent the object by its JSON string."""
        return str(self.to_json())

    def to_json(self):
        """Return the instance attributes as a JSON string.

        Non-ASCII characters are written as-is rather than escaped.
        """
        return json.dumps(self.__dict__, ensure_ascii=False)

    def to_dict(self):
        """Return the instance attribute dictionary itself (not a copy)."""
        return self.__dict__
@@ -0,0 +1,41 @@
new file mode 100755
index 0000000..f90e778
--- /dev/null
+++ b/parsers/
@@ -0,0 +1,41 @@
+ import xml.etree.cElementTree as ET
+except ImportError:
+ import xml.etree.ElementTree as ET
KNOWN_FIELD_TAGS = ['100', '110', '151']
TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374', '046', '901', '001']


class INLXmlParser:
    """Reduces a raw record to a flat record holding only whitelisted fields."""

    def __init__(self, reader, whitelist=TAG_WHITELIST):
        """
        :param reader: iterable of the record's field elements (for example
            the record element itself).
        :param whitelist: tags of the fields to keep.
        """
        self.reader = reader
        # self.whitelist = whitelist or KNOWN_FIELD_TAGS
        self.whitelist = whitelist

    def clearxml(self):
        """Build the cleaned copy of the record.

        Fields whose tag is not whitelisted are dropped. A '001' field is
        copied with its own text; every other field is expanded into one
        ``datafield`` per child, tagged ``<tag>.<code>`` and carrying the
        child's text. Tag '700' is renamed to '400'.

        :return: an ElementTree rooted at a new ``record`` element.
        """
        new_record = ET.Element('record')
        for field in self.reader:
            field_tag = field.attrib.get('tag')
            if field_tag not in self.whitelist:
                continue
            if field_tag == '001':
                new_field = ET.SubElement(new_record, 'datafield', {'tag': '001'})
                new_field.text = field.text
                continue
            # tag 700 and 400 are the same
            if field_tag == '700':
                field_tag = '400'
            for data in field:
                # A child without a code contributes an empty code instead
                # of failing on str + None.
                code = data.attrib.get('code') or ''
                new_field = ET.SubElement(new_record, 'datafield', {'tag': field_tag + '.' + code})
                new_field.text = data.text
        return ET.ElementTree(new_record)
@@ -0,0 +1,2 @@
new file mode 100755
index 0000000..d32c917
--- /dev/null
+++ b/parsers/
@@ -0,0 +1,2 @@
+from .INL_xml_parser import INLXmlParser \ No newline at end of file
@@ -0,0 +1,6 @@
new file mode 100755
index 0000000..dae19cb
--- /dev/null
+++ b/parsers/
@@ -0,0 +1,6 @@
class BasicParser(object):
    """Interface of the parsers."""

    def __init__(self):
        pass

    def parse(self, data):
        """Parse ``data``; must be overridden by subclasses.

        :raises NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError("parse() method must be implemented class {}".format(type(self)))
@@ -0,0 +1,58 @@
new file mode 100755
index 0000000..e69de29
--- /dev/null
+++ b/readers/
diff --git a/readers/ b/readers/
new file mode 100755
index 0000000..4d8374e
--- /dev/null
+++ b/readers/
@@ -0,0 +1,58 @@
+from __future__ import absolute_import
+import json
+import csv
+import parsers
+import factories
+from entities import Person
+from writers.wd_writer import get_entity_by_viaf
+ import xml.etree.cElementTree as ET
+except ImportError:
+ import xml.etree.ElementTree as ET
def read_file(path, element_key):
    """Stream the XML file at ``path`` and process each matching element.

    Every element whose tag contains ``element_key`` is cleaned with
    INLXmlParser, turned into an entity by INLFactory and printed as JSON;
    when the entity has a VIAF id, the result of ``get_entity_by_viaf`` is
    printed too. One output file per entity type is opened under ``out/``.
    The number of processed elements is printed at the end.

    :param path: path of the XML file.
    :param element_key: text identifying the record elements by their tag.
    """
    record_counter = 0
    # Incremental parsing keeps memory use independent of the file size.
    context = iter(ET.iterparse(path, events=("start", "end")))
    # The first event delivers the root element.
    event, root = next(context)
    inl_factory = factories.INLFactory()
    files = {}
    try:
        for event, element in context:
            # An element is complete only at its 'end' event.
            if event != 'end' or element_key not in element.tag:
                continue
            record_counter += 1
            # cleaned element is a tree
            inl_parser = parsers.INLXmlParser(element)
            cleaned_element = inl_parser.clearxml()
            entity = inl_factory.get_entity(cleaned_element)
            if entity is not None:
                # No entity class defines TYPE; fall back to the class name
                # so the per-type file can still be named.
                entity_type = getattr(entity, 'TYPE', type(entity).__name__)
                if entity_type not in files:
                    files[entity_type] = open("out/{}.csv".format(entity_type), 'w+', encoding='utf8')
                json_entity = entity.to_json()
                print(json_entity)
                # writer = csv.DictWriter(files[entity_type], entity.CSV_FIELDS)
                # writer.writerow(entity.to_dict())
                if entity.viaf:
                    print(get_entity_by_viaf(entity.viaf))
                # TODO analys and upload the entity
            # Release the processed record so memory does not accumulate.
            element.clear()
    finally:
        # Close every output file, even when processing stops on an error.
        for handle in files.values():
            handle.close()
    print(record_counter)
# Script entry point: process the record elements of the sample export.
if __name__ == '__main__':
    read_file(r"../../NLI-nnl10.xml", 'record')
@@ -0,0 +1,19 @@
new file mode 100755
index 0000000..b6029ca
--- /dev/null
+++ b/testers/
@@ -0,0 +1,19 @@
+from __future__ import absolute_import
+import parsers
+import factories
+import xml.etree.cElementTree as ET
# Manual smoke test: parse the example file, build one entity per record and
# print each of them.
xmlpath = 'C:/Users/Ilsar/Documents/datahack/xml_example.xml'
xmltree = ET.parse(xmlpath)
entities = list()
inl_factory = factories.INLFactory()
for record in xmltree.getroot():
    inl_parser = parsers.INLXmlParser(record)
    clean_record = inl_parser.clearxml()
    entities.append(inl_factory.get_entity(clean_record))
for entity in entities:
    # The factory returns None for records it does not recognise.
    if entity is not None:
        entity.print_entity()
@@ -0,0 +1,26 @@
new file mode 100755
index 0000000..e69de29
--- /dev/null
+++ b/
diff --git a/writers/ b/writers/
new file mode 100755
index 0000000..e69de29
--- /dev/null
+++ b/writers/
diff --git a/writers/ b/writers/
new file mode 100755
index 0000000..62ab11c
--- /dev/null
+++ b/writers/
@@ -0,0 +1,26 @@
+import pywikibot
+from pywikibot import pagegenerators
+from import wikidataquery
# Data repository of the default pywikibot site, resolved at import time.
repo = pywikibot.Site().data_repository()


def write_to_wd(entity):
    """Look up the Wikidata record matching ``entity``.

    Nothing is written yet; only the lookup by VIAF is performed.

    :param entity: object with a ``viaf`` attribute.
    :return: the result of ``get_entity_by_viaf`` when the entity has a VIAF
        id, otherwise ``None``.
    """
    if entity.viaf:
        return get_entity_by_viaf(entity.viaf)
    return None
+# Finds the matching record in Wikidata by VIAF identifier
def get_entity_by_viaf(viaf):
    """Find the Wikidata page whose VIAF id (P214) equals ``viaf``.

    :param viaf: VIAF identifier to search for.
    :return: the single matching page, or ``None`` when nothing matches.
    :raises Exception: when more than one page matches.
    """
    # Escape backslashes and quotes so the identifier cannot terminate the
    # quoted literal inside the query.
    safe_viaf = str(viaf).replace("\\", "\\\\").replace("'", "\\'")
    sparql = "SELECT ?item WHERE {{ ?item wdt:P214 ?VIAF filter(?VIAF = '{}') }}".format(safe_viaf)
    # The query is SPARQL, so it needs the SPARQL generator; the WDQ
    # generator used before expects a different query language.
    entities = list(pagegenerators.WikidataSPARQLPageGenerator(sparql, site=repo))
    if not entities:
        print("No entity found for VIAF: {}".format(viaf))
        return None
    if len(entities) > 1:
        # TODO: is it possible to have multiple VIAFs?
        raise Exception('VIAF is expected to be unique')
    return entities[0]