From cfed90c1096a92c4c9e622dfe2d55d892595b2ff Mon Sep 17 00:00:00 2001
From: Ido Ivri
Date: Sun, 30 Oct 2016 19:47:28 +0200
Subject: initial commit of work done in DataHack

---
 .gitignore                 | 138 ++++++++++++++++++++++++++++++++++++++++++
 .idea/misc.xml             |   4 ++
 .idea/modules.xml          |   8 +++
 .idea/parser.iml           |  12 ++++
 __init__.py                |   0
 clustering_678.py          | 100 +++++++++++++++++++++++++++++++
 entities/__init__.py       |   5 ++
 entities/basic_entity.py   |   5 ++
 entities/institution.py    |   7 +++
 entities/location.py       |  27 +++++++++
 entities/person.py         | 119 +++++++++++++++++++++++++++++++++++++
 entities/snaks.py          |  91 ++++++++++++++++++++++++++++
 factories/INL_factory.py   | 145 +++++++++++++++++++++++++++++++++++++++++++++
 factories/__init__.py      |   2 +
 factories/basic_factory.py |   3 +
 libs/__init__.py           |   1 +
 libs/json_tools.py         |  12 ++++
 parsers/INL_xml_parser.py  |  41 +++++++++++++
 parsers/__init__.py        |   2 +
 parsers/basic_parser.py    |   6 ++
 readers/__init__.py        |   0
 readers/xml_reader.py      |  58 ++++++++++++++++++
 testers/factorytester.py   |  19 ++++++
 user-config.py             |   0
 writers/__init__.py        |   0
 writers/wd_writer.py       |  26 ++++++++
 26 files changed, 831 insertions(+)
 create mode 100755 .gitignore
 create mode 100755 .idea/misc.xml
 create mode 100755 .idea/modules.xml
 create mode 100755 .idea/parser.iml
 create mode 100755 __init__.py
 create mode 100755 clustering_678.py
 create mode 100755 entities/__init__.py
 create mode 100755 entities/basic_entity.py
 create mode 100755 entities/institution.py
 create mode 100755 entities/location.py
 create mode 100755 entities/person.py
 create mode 100755 entities/snaks.py
 create mode 100755 factories/INL_factory.py
 create mode 100755 factories/__init__.py
 create mode 100755 factories/basic_factory.py
 create mode 100755 libs/__init__.py
 create mode 100755 libs/json_tools.py
 create mode 100755 parsers/INL_xml_parser.py
 create mode 100755 parsers/__init__.py
 create mode 100755 parsers/basic_parser.py
 create mode 100755 readers/__init__.py
 create mode 100755 readers/xml_reader.py
 create mode 100755 testers/factorytester.py
 create mode 100755 user-config.py
 create mode 100755 writers/__init__.py
 create mode 100755 writers/wd_writer.py

diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..7ebdd82
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,138 @@
+# Created by .ignore support plugin (hsz.mobi)
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff:
+.idea/workspace.xml
+.idea/tasks.xml
+.idea/dictionaries
+.idea/vcs.xml
+.idea/jsLibraryMappings.xml
+
+# Sensitive or high-churn files:
+.idea/dataSources.ids
+.idea/dataSources.xml
+.idea/dataSources.local.xml
+.idea/sqlDataSources.xml
+.idea/dynamic.xml
+.idea/uiDesigner.xml
+
+# Gradle:
+.idea/gradle.xml
+.idea/libraries
+
+# Mongo Explorer plugin:
+.idea/mongoSettings.xml
+
+## File-based project format:
+*.iws
+
+## Plugin-specific files:
+
+# IntelliJ
+/out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
+.out/*
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100755
index 0000000..f778c9e
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100755
index 0000000..405d108
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/parser.iml b/.idea/parser.iml
new file mode 100755
index 0000000..6f63a63
--- /dev/null
+++ b/.idea/parser.iml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/__init__.py b/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/clustering_678.py b/clustering_678.py
new file mode 100755
index 0000000..99391ac
--- /dev/null
+++ b/clustering_678.py
@@ -0,0 +1,100 @@
+# coding=utf-8
+import pandas as pd
+import unicodedata
+from sklearn.cluster import KMeans
+
+PERSON_001_PATH = r"C:\Users\Emanuel\Desktop\DataHack16\PERSON - 001.csv"
+FEATURES_TABLE_PATH = r"C:\Users\Emanuel\Desktop\DataHack16\features_678.csv"
+
+
+# We find all keys with an empty value:
+def return_keys_without_value(dic):
+    keys = []
+    for key, value in dic.items():
+        if value == "":  # todo: take care of keys without value that do contain : somewhere
+            keys.append(key)
+    if len(keys) > 0:
+        return keys
+    else:
+        return None
+
+
+data = pd.read_csv(PERSON_001_PATH, names=["string", "id"])
+data = data.dropna()
+data = data[data["string"] != "{}"]
+data["string"] = [eval(k) for k in data["string"]]  # rows hold dict literals; ast.literal_eval would be safer than eval
+data["string"] = [return_keys_without_value(dic) for dic in data["string"]]
+data = data.dropna()
+
+string_list = []
+id_list = []
+for _, row in data.iterrows():
+    for elem in row["string"]:
+        string_list.append(elem)
+        id_list.append(row["id"])
+new_data = pd.DataFrame({"string": string_list, "id": id_list})
+new_data.to_csv(r"C:\Users\Emanuel\Desktop\DataHack16\data_separated.csv", encoding="utf-8")
+
+
+def is_all_hebrew(s):
+    try:
+        s = s.decode("utf-8")
+    except AttributeError:
+        pass
+
+    # remove all non-letter characters:
+    q = ""
+    for i in s:
+        if i.isalpha():
+            q = "".join([q, i])
+
+    return all('HEBREW' in unicodedata.name(c) for c in q)
+
+
+def is_all_english(s):
+    try:
+        s = s.decode("utf-8")
+    except AttributeError:
+        pass
+
+    # remove all non-letter characters:
+    chars_only = ""
+    for i in s:
+        if i.isalpha():
+            chars_only = "".join([chars_only, i])
+    return all('LATIN' in unicodedata.name(c) for c in chars_only)
+
+
+def count_words(s):
+    return len(s.split())
+
+
+# todo: add a feature "contains_predefined_year_prefixes", like b. or d.
+# todo: add a feature that checks whether the string contains a number that is not a year (i.e. not in the range ...)
+# todo: detect hebrew years using quotes
+new_data["is_all_hebrew"] = new_data["string"].apply(is_all_hebrew)
+new_data["is_all_english"] = new_data["string"].apply(is_all_english)
+new_data["number_of_words"] = new_data["string"].apply(count_words)
+new_data["contains_quote"] = new_data["string"].apply(lambda s: '"' in s)
+new_data["contains_colon"] = new_data["string"].apply(lambda s: ':' in s)
+new_data.to_csv(FEATURES_TABLE_PATH)
+
+X = new_data.copy()
+assert isinstance(X, pd.DataFrame)
+del X["id"]
+del X["string"]
+print(X.columns)
+X = (X - X.mean()) / (X.max() - X.min())  # normalizing the features
+
+range_n_clusters = [4, 6]
+for n_clusters in range_n_clusters:
+    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
+    cluster_labels = clusterer.fit_predict(X)
+    centers = clusterer.cluster_centers_
+    print("\n %s clusters:" % n_clusters)
+    print("cluster labels: %s" % cluster_labels)
+    print("cluster centers: %s" % centers)
+
+    for k in range(n_clusters):
+        print("\ncluster %d consists of the following strings:" % k)
+        print(new_data["string"][cluster_labels == k])
diff --git a/entities/__init__.py b/entities/__init__.py
new file mode 100755
index 0000000..1398576
--- /dev/null
+++ b/entities/__init__.py
@@ -0,0 +1,5 @@
+from entities.person import Person
+from entities.institution import Institution
+from entities.location import Location
+
+from entities.snaks import EntityIdSnak, GeoSnak, MonoLingualStringSnak, SomeValueSnak, StringSnak, TimeSnak, UrlSnak
\ No newline at end of file
diff --git a/entities/basic_entity.py b/entities/basic_entity.py
new file mode 100755
index 0000000..9181422
--- /dev/null
+++ b/entities/basic_entity.py
@@ -0,0 +1,5 @@
+from libs import JsonSerializable
+
+
+class BasicEntity(JsonSerializable):
+    pass
diff --git a/entities/institution.py b/entities/institution.py
new file mode 100755
index 0000000..5fa7570
--- /dev/null
+++ b/entities/institution.py
@@ -0,0 +1,7 @@
+from entities.basic_entity import BasicEntity
+
+
+class Institution(BasicEntity):
+    def __init__(self, viaf=None):
+        # BasicEntity takes no constructor arguments, so there is nothing to pass to super().__init__()
+        raise NotImplementedError()
diff --git a/entities/location.py b/entities/location.py
new file mode 100755
index 0000000..cdec3a6
--- /dev/null
+++ b/entities/location.py
@@ -0,0 +1,27 @@
+import json
+
+from entities.basic_entity import BasicEntity
+
+
+class Location(BasicEntity):
+    def __init__(self, name, types_of_place, name_in_langs, comments_list, viaf):
+        self.name = name
+        self.types_of_place = types_of_place
+        self.name_in_langs = name_in_langs
+        self.comments_list = comments_list
+        self.viaf = viaf
+
+    # CSV_FIELDS = ["name", "comments"]
+    CSV_FIELDS = ["viaf", "name", "types_of_place", "name_in_langs", "comments_list"]
+    TYPE = "LOCATION"
+
+
+    def print_entity(self):
+        print("Name = " + self.name)
+        print("Name in langs = " + str(self.name_in_langs))
+        print("Types = " + str(self.types_of_place))
+        print("Comments = " + str(self.comments_list))
+
+    def to_csv_dict(self):
+        return {'name': self.name,
+                'comments': json.dumps(self.comments_list, ensure_ascii=False)}
diff --git a/entities/person.py b/entities/person.py
new file mode 100755
index 0000000..46ed315
--- /dev/null
+++ b/entities/person.py
@@ -0,0 +1,119 @@
+import json
+
+from entities.snaks import *
+from entities.basic_entity import BasicEntity
+
+
+class Person(BasicEntity):
+    def __init__(self, name, date_of_birth, date_of_death, name_in_langs, bio_data, comments_list, profession, viaf,
+                 national_lib_id):
+        """
+
+        :param name:
+        :param date_of_birth:
+        :param name_in_langs: Mapping of the person's name in various languages, as a dictionary. For example:
+        {
+            "latin": "George",
+            "heb": "[george in hebrew]"
+        }
+        """
+        self.name = name
+        dob = [date_of_birth]
+        dod = [date_of_death]
+        self.name_in_langs = name_in_langs
+        self.national_lib_id = national_lib_id
+
+        bio_data_dict = dict()
+        struct_bio_data = dict()
+        for elem in bio_data:
+            elem_splitted = elem.split(":")
+            if len(elem_splitted) == 2:
+                bio_data_key = elem_splitted[0].strip()
+                bio_data_value = elem_splitted[1].strip()
+
+                if bio_data_key.startswith(u"תאריך לידה"):  # keys were split on ':', so they carry no colon suffix
+                    dob.append(bio_data_value)
+                elif bio_data_key.startswith(u"תאריך פטירה"):
+                    dod.append(bio_data_value)
+                elif bio_data_key.startswith(u"מקצוע") or bio_data_key.startswith(u"מיקצוע"):
+                    profession.append(bio_data_value)
+                else:
+                    struct_bio_data[bio_data_key] = bio_data_value
+
+                if bio_data_key in bio_data_dict:
+                    bio_data_dict.get(bio_data_key).append(bio_data_value)
+                else:
+                    bio_data_dict.update(
+                        {bio_data_key: [bio_data_value]}
+                    )
+            else:
+                bio_data_dict.update({elem: ''})
+        self.bio_data = bio_data_dict
+        self.comments_list = comments_list
+        self.profession = profession
+        self.viaf = viaf
+        self.date_of_birth = dob
+        self.date_of_death = dod
+        self.struct_bio_data = struct_bio_data
+
+    # superseded draft field lists from the hackathon,
+    # kept for reference:
+    # CSV_FIELDS = ["name", "biodata", "comments", "viaf"]
+    # CSV_FIELDS = ["678 - biodata", "001 - national lib id"]
+    # CSV_FIELDS = ["viaf", "name", "biodata", "comments"]
+
+    CSV_FIELDS = ["viaf", "national_lib_id", "name", "date_of_birth", "date_of_death", "name_in_langs", "bio_data",
+                  "struct_bio_data", "comments_list", "profession"]
+    TYPE = 'PERSON'
+
+    def print_entity(self):
+        print("Name = " + self.name)
+        print("Birth year = " + str(self.date_of_birth))
+        print("Death year = " + str(self.date_of_death))
+        print("Names in langs = " + str(self.name_in_langs))
+        print("Bio Data = " + json.dumps(self.bio_data))
+        print("Comments = " + json.dumps(self.comments_list))
+        print("Profession = " + json.dumps(self.profession))
+
+    def to_csv_dict(self):
+        return {'viaf': self.viaf, 'name': self.name, 'biodata': self.bio_data,
+                'comments': json.dumps(self.comments_list, ensure_ascii=False)}
+
+    def to_wd_claims(self):
+        claims = []
+
+        if self.date_of_birth:
+            claims.append({
+                "type": "claim",
+                "mainsnak": TimeSnak(property='P569', date=self.date_of_birth[0]).to_json()
+            })
+        if self.date_of_death:
+            claims.append({
+                "type": "claim",
+                "mainsnak": TimeSnak(property='P570', date=self.date_of_death[0]).to_json()
+            })
+        if self.profession:
+            for elem in self.profession:
+                claims.append({
+                    "type": "claim",
+                    "mainsnak": StringSnak(property='P106', value=elem).to_json()
+                })
+        if self.viaf:
+            claims.append({
+                "type": "claim",
+                "mainsnak": StringSnak(property='P214', value=self.viaf).to_json()
+            })
+        if self.struct_bio_data:
+            for bio_key, bio_value in self.struct_bio_data.items():
+                if bio_key.startswith(u"מקום לידה"):
+                    claims.append({
+                        "type": "claim",
+                        "mainsnak": StringSnak(property='P19', value=bio_value).to_json()
+                    })
+                if bio_key.startswith(u"מקום פטירה"):
+                    claims.append({
+                        "type": "claim",
+                        "mainsnak": StringSnak(property='P20', value=bio_value).to_json()
+                    })
+
+        return claims
diff --git a/entities/snaks.py b/entities/snaks.py
new file mode 100755
index 0000000..8874329
--- /dev/null
+++ b/entities/snaks.py
@@ -0,0 +1,91 @@
+from datetime import datetime
+
+from libs import JsonSerializable
+
+
+class BasicSnak(JsonSerializable):
+    def __init__(self, snaktype, property, datatype, datavalue):
+        self.snaktype = snaktype
+        self.property = property
+        self.datatype = datatype
+        self.datavalue = datavalue
+
+
+class StringSnak(BasicSnak):
+    def __init__(self, property, value):
+        datavalue = {
+            "type": "string",
+            "value": value
+        }
+        super().__init__(snaktype="value", property=property, datatype="string", datavalue=datavalue)
+
+
+class MonoLingualStringSnak(BasicSnak):
+    def __init__(self, property, value, language):
+        datavalue = {
+            "type": "monolingualtext",
+            "value": {
+                "language": language,
+                "text": value
+            }
+        }
+        super().__init__(snaktype="value", property=property, datatype="monolingualtext", datavalue=datavalue)
+
+
+class EntityIdSnak(BasicSnak):
+    def __init__(self, property, entity_type, entity_id):
+        datavalue = {
+            "value": {
+                "entity-type": entity_type,
+                "numeric-id": entity_id
+            },
+            "type": "wikibase-item"
+        }
+        super().__init__(snaktype="value", property=property, datatype="wikibase-entityid", datavalue=datavalue)
+
+
+class UrlSnak(BasicSnak):
+    def __init__(self, property, url):
+        datavalue = {
+            "type": "string",
+            "value": url
+        }
+        super().__init__(snaktype="value", property=property, datatype="url", datavalue=datavalue)
+
+
+class TimeSnak(BasicSnak):
+    def __init__(self, property, date, precision=11):
+        if not isinstance(date, datetime):
+            date = datetime(int(date), 1, 1)  # assumes a bare year string such as "1887"; pass precision=9 for year-only dates
+        datavalue = {
+            "value": {
+                "time": date.isoformat(),
+                "timezone": 0,
+                "before": 0,
+                "after": 0,
+                "precision": precision,
+                "calendarmodel": "http://www.wikidata.org/entity/Q1985727"
+            },
+            "type": "time"
+        }
+        super().__init__(snaktype="value", property=property, datatype="time", datavalue=datavalue)
+
+
+class GeoSnak(BasicSnak):
+    def __init__(self, property, latitude, longitude, precision):
+        datavalue = {
+            "value": {
+                "latitude": latitude,
+                "longitude": longitude,
+                "altitude": None,
+                "precision": precision,
+                "globe": "http://www.wikidata.org/entity/Q2"
+            },
+            "type": "globecoordinate"
+        }
+        super().__init__(snaktype="value", property=property, datatype="globe-coordinate", datavalue=datavalue)
+
+
+class SomeValueSnak(BasicSnak):
+    def __init__(self, property):
+        super().__init__(snaktype="somevalue", property=property, datatype=None, datavalue=None)
diff --git a/factories/INL_factory.py b/factories/INL_factory.py
new file mode 100755
index 0000000..72b618c
--- /dev/null
+++ b/factories/INL_factory.py
@@ -0,0 +1,145 @@
+import entities
+from factories import BasicFactory
+import xml.etree.cElementTree as ET
+
+TAG_TO_ENTITY_MAPPING = {
+    '100': entities.Person,
+    '110': entities.Institution,
+    '151': entities.Location
+}
+
+
+ENTITY_KEYS = {
+    '100.a': 'name',
+    '100.9': 'name_langindic',
+    '046.f': 'date_of_birth',
+    '046.g': 'date_of_death',
+    '400.a': 'name_in_langs',
+    '400.9': 'langs_langindic',
+    '678.a': 'bio_data',
+    '151.a': 'name',
+    '151.9': 'name_langindic',
+    '451.a': 'name_in_langs',
+    '451.9': 'langs_langindic',
+    '550.a': 'type_of_place',
+    '667.a': 'comment',
+    '374.a': 'profession',
+    '901.a': 'viaf',
+    # control field 001 has no subfield code, so the parser emits a bare '001' tag
+    '001': 'national_lib_id',
+}
+
+
+def get_record_key(record):
+    root = record.getroot()
+    for field in root:
+        field_tag = field.attrib.get('tag')
+        if '100' in field_tag:
+            return '100'
+        if '151' in field_tag:
+            return '151'
+        if '110' in field_tag:
+            return '110'
+
+class INLFactory(BasicFactory):
+    def __init__(self, tag_to_entity_mapping=None):
+        self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING
+
+    def get_entity(self, raw_object, entity_keys=ENTITY_KEYS):
+        record_key = get_record_key(raw_object)
+        # tag 100 marks a person record
+        if record_key == '100':
+            name = ''
+            name_in_langs = dict()
+            bio_data = list()
+            comment_list = list()
+            eng_name = ''
+            profession = list()
+            name_diff = ''
+            date_of_birth = ''
+            date_of_death = ''
+            viaf = ''
+            national_lib_id = ''
+            # get the names, dates of birth and death, and bio data
+            for field in raw_object.getroot():
+                key = field.attrib.get('tag')
+                tag = entity_keys.get(key)
+                if tag == 'name':
+                    name = field.text
+                elif tag == 'name_langindic':
+                    # check if this is the English (Latin-script) name
+                    if field.text == 'lat':
+                        eng_name = name
+                    # else add it to name_in_langs
+                    else:
+                        if field.text in name_in_langs:
+                            name_in_langs.get(field.text).append(name)
+                        else:
+                            name_in_langs.update({field.text: [name]})
+                elif tag == 'date_of_birth':
+                    date_of_birth = field.text
+                elif tag == 'date_of_death':
+                    date_of_death = field.text
+                elif tag == 'name_in_langs':
+                    name_diff = field.text
+                elif tag == 'langs_langindic':
+                    if field.text in name_in_langs:
+                        name_in_langs.get(field.text).append(name_diff)
+                    else:
+                        name_in_langs.update({field.text: [name_diff]})
+                elif tag == 'bio_data':
+                    bio_data.append(field.text)
+                elif tag == 'comment':
+                    comment_list.append(field.text)
+                elif tag == 'profession':
+                    profession.append(field.text)
+                elif tag == 'viaf':
+                    viaf = field.text
+                elif tag == 'national_lib_id':
+                    national_lib_id = field.text
+            return entities.Person(eng_name, date_of_birth, date_of_death, name_in_langs, bio_data, comment_list, profession, viaf, national_lib_id)
+        # tag 110 marks an institution record
+        elif record_key == '110':
+            return entities.Institution()
+        # tag 151 marks a location record
+        elif record_key == '151':
+            name_in_langs = dict()
+            types_of_place = list()
+            comment_list = list()
+            eng_name = name = ''
+            name_diff = ''
+            viaf = ''
+            for field in raw_object.getroot():
+                key = field.attrib.get('tag')
+                tag = entity_keys.get(key)
+                if tag == 'name':
+                    name = field.text
+                elif tag == 'name_langindic':
+                    # check if this is the English (Latin-script) name
+                    if field.text == 'lat':
+                        eng_name = name
+                    # else add it to name_in_langs
+                    else:
+                        if field.text in name_in_langs:
+                            name_in_langs.get(field.text).append(name)
+                        else:
+                            name_in_langs.update({field.text: [name]})
+                elif tag == 'type_of_place':
+                    types_of_place.append(field.text)
+                elif tag == 'name_in_langs':
+                    name_diff = field.text
+                elif tag == 'langs_langindic':
+                    if field.text in name_in_langs:
+                        name_in_langs.get(field.text).append(name_diff)
+                    else:
+                        name_in_langs.update({field.text: [name_diff]})
+                elif tag == 'comment':
+                    comment_list.append(field.text)
+                elif tag == 'viaf':
+                    viaf = field.text
+            return entities.Location(eng_name, types_of_place, name_in_langs, comment_list, viaf)
+        else:
+            return None
+            # raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self)))
+
+
diff --git a/factories/__init__.py b/factories/__init__.py
new file mode 100755
index 0000000..86901f5
--- /dev/null
+++ b/factories/__init__.py
@@ -0,0 +1,2 @@
+from factories.basic_factory import BasicFactory
+from factories.INL_factory import INLFactory
\ No newline at end of file
diff --git a/factories/basic_factory.py b/factories/basic_factory.py
new file mode 100755
index 0000000..1715846
--- /dev/null
+++ b/factories/basic_factory.py
@@ -0,0 +1,3 @@
+class BasicFactory(object):
+    def get_entity(self, raw_object, entity_keys=None):
+        raise NotImplementedError("get_entity() method must be implemented by class {}".format(type(self)))
diff --git a/libs/__init__.py b/libs/__init__.py
new file mode 100755
index 0000000..c2514b7
--- /dev/null
+++ b/libs/__init__.py
@@ -0,0 +1 @@
+from libs.json_tools import JsonSerializable
\ No newline at end of file
diff --git a/libs/json_tools.py b/libs/json_tools.py
new file mode 100755
index 0000000..5c26b24
--- /dev/null
+++ b/libs/json_tools.py
@@ -0,0 +1,12 @@
+import json
+
+
+class JsonSerializable(object):
+    def __repr__(self):
+        return str(self.to_json())
+
+    def to_json(self):
+        return json.dumps(self.__dict__, ensure_ascii=False)
+
+    def to_dict(self):
+        return self.__dict__
\ No newline at end of file
diff --git a/parsers/INL_xml_parser.py b/parsers/INL_xml_parser.py
new file mode 100755
index 0000000..f90e778
--- /dev/null
+++ b/parsers/INL_xml_parser.py
@@ -0,0 +1,41 @@
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+KNOWN_FIELD_TAGS = ['100', '110', '151']
+
+TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374', '046', '901', '001']
+
+
+class INLXmlParser:
+    def __init__(self, reader, whitelist=TAG_WHITELIST):
+        self.reader = reader
+        # self.whitelist = whitelist or KNOWN_FIELD_TAGS
+        self.whitelist = whitelist
+
+    def clearxml(self):
+
+        # scan the datafields in the record and copy into a new record
+        # only the fields whose tag appears in the whitelist
+        newRecord = ET.Element('record')
+        for field in self.reader:
+            fieldtag = field.attrib.get('tag')
+            if fieldtag in self.whitelist:
+                temptag = fieldtag
+                if fieldtag == '001':
+                    newTag = ET.SubElement(newRecord, 'datafield', {'tag': '001'})
+                    newTag.text = field.text
+                else:
+                    # tags 700 and 400 are treated as the same field
+                    if temptag == '700':
+                        temptag = '400'
+                    for data in field:
+                        newFieldTag = temptag
+                        newFieldTag += '.'
+                        newFieldTag += data.attrib.get('code')
+                        newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag})
+                        newTag.text = data.text
+
+        newRecordTree = ET.ElementTree(newRecord)
+        return newRecordTree
diff --git a/parsers/__init__.py b/parsers/__init__.py
new file mode 100755
index 0000000..d32c917
--- /dev/null
+++ b/parsers/__init__.py
@@ -0,0 +1,2 @@
+
+from .INL_xml_parser import INLXmlParser
\ No newline at end of file
diff --git a/parsers/basic_parser.py b/parsers/basic_parser.py
new file mode 100755
index 0000000..dae19cb
--- /dev/null
+++ b/parsers/basic_parser.py
@@ -0,0 +1,6 @@
+class BasicParser(object):
+    def __init__(self):
+        pass
+
+    def parse(self, data):
+        raise NotImplementedError("parse() method must be implemented by class {}".format(type(self)))
diff --git a/readers/__init__.py b/readers/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/readers/xml_reader.py b/readers/xml_reader.py
new file mode 100755
index 0000000..4d8374e
--- /dev/null
+++ b/readers/xml_reader.py
@@ -0,0 +1,58 @@
+from __future__ import absolute_import
+import json
+import csv
+import parsers
+import factories
+from entities import Person
+from writers.wd_writer import get_entity_by_viaf
+
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+
+def read_file(path, element_key):
+    # get an iterable
+    record_counter = 0
+    context = ET.iterparse(path, events=("start", "end"))
+
+    # turn it into an iterator
+    context = iter(context)
+
+    # get the root element
+    event, root = next(context)
+
+    # the factory
+    inl_factory = factories.INLFactory()
+    files = {}
+    for event, element in context:
+        if event == 'end':
+            if element_key in element.tag:
+                # enter the processing here
+                record_counter += 1
+
+                # cleaned element is a tree
+                inl_parser = parsers.INLXmlParser(element)
+                cleaned_element = inl_parser.clearxml()
+                entity = inl_factory.get_entity(cleaned_element)
+
+                # test print the entity
+                if entity is not None:
+                    if entity.TYPE not in files:
+                        files[entity.TYPE] = open("out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8')
+                    json_entity = entity.to_json()
+                    print(json_entity)
+                    # writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS)
+                    # writer.writerow(entity.to_dict())
+
+                    if entity.viaf:
+                        print(get_entity_by_viaf(entity.viaf))
+
+                # TODO analyse and upload the entity
+                element.clear()
+    print(record_counter)
+
+
+if __name__ == '__main__':
+    read_file(r"../../NLI-nnl10.xml", 'record')
diff --git a/testers/factorytester.py b/testers/factorytester.py
new file mode 100755
index 0000000..b6029ca
--- /dev/null
+++ b/testers/factorytester.py
@@ -0,0 +1,19 @@
+from __future__ import absolute_import
+import parsers
+import factories
+import xml.etree.cElementTree as ET
+
+xmlpath = 'C:/Users/Ilsar/Documents/datahack/xml_example.xml'
+
+xmltree = ET.parse(xmlpath)
+entities = list()
+inl_factory = factories.INLFactory()
+
+for record in xmltree.getroot():
+    inl_parser = parsers.INLXmlParser(record)
+    clean_record = inl_parser.clearxml()
+    entities.append(inl_factory.get_entity(clean_record))
+
+for entity in entities:
+    entity.print_entity()
+
diff --git a/user-config.py b/user-config.py
new file mode 100755
index 0000000..e69de29
diff --git a/writers/__init__.py b/writers/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/writers/wd_writer.py b/writers/wd_writer.py
new file mode 100755
index 0000000..62ab11c
--- /dev/null
+++ b/writers/wd_writer.py
@@ -0,0 +1,26 @@
+import pywikibot
+from pywikibot import pagegenerators
+
+repo = pywikibot.Site().data_repository()
+
+
+def write_to_wd(entity):
+    # stub: look up an existing Wikidata item by VIAF before writing claims
+    if entity.viaf:
+        return get_entity_by_viaf(entity.viaf)
+
+
+# Finds the matching Wikidata item by its VIAF identifier (P214)
+def get_entity_by_viaf(viaf):
+    sparql = "SELECT ?item WHERE {{ ?item wdt:P214 ?VIAF filter(?VIAF = '{}') }}".format(viaf)
+
+    # WikidataSPARQLPageGenerator takes SPARQL; the legacy WikidataQueryPageGenerator expected WDQ syntax
+    entities = pagegenerators.WikidataSPARQLPageGenerator(sparql)
+    entities = list(entities)
+    if len(entities) == 0:
+        print("No entity found for VIAF: {}".format(viaf))
+        return None
+    elif len(entities) > 1:
+        # TODO: can distinct Wikidata items share a VIAF id? assumed unique for now
+        raise Exception('VIAF is expected to be unique')
+    return entities[0]
\ No newline at end of file
--
cgit v1.2.3
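
Appendix (not part of the patch): a minimal usage sketch of the pipeline this commit introduces — a MARC-XML authority record fed through INLXmlParser into INLFactory, then serialized to Wikidata-style claims. It mirrors testers/factorytester.py; the input path is a placeholder, and records of type 110 (Institution) would raise NotImplementedError since Institution is still a stub.

import xml.etree.ElementTree as ET

import factories
import parsers

# placeholder path: any NLI authorities MARC-XML export whose root holds <record> children
xmltree = ET.parse('xml_example.xml')
inl_factory = factories.INLFactory()

for record in xmltree.getroot():
    # copy only whitelisted MARC fields into a fresh record tree
    clean_record = parsers.INLXmlParser(record).clearxml()
    entity = inl_factory.get_entity(clean_record)  # Person, Location, or None
    if entity is not None and entity.TYPE == 'PERSON':
        print(entity.to_json())
        print(entity.to_wd_claims())  # claim dicts for P569/P570/P106/P214/P19/P20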