"""Entity and Wikibase snak models (``entities`` package).

Provenance: reconstructed from a whitespace-collapsed git patch,
commit cfed90c1096a92c4c9e622dfe2d55d892595b2ff by Ido Ivri
(Sun, 30 Oct 2016 19:47:28 +0200), "initial commit of work done in DataHack".
The patch created six modules:

    entities/__init__.py      re-exports Person, Institution, Location and
                              EntityIdSnak, GeoSnak, MonoLingualStringSnak,
                              SomeValueSnak, StringSnak, TimeSnak, UrlSnak
    entities/basic_entity.py  BasicEntity
    entities/institution.py   Institution
    entities/location.py      Location
    entities/person.py        Person
    entities/snaks.py         BasicSnak and its subclasses

They are laid out below in dependency order.  The intra-package imports
(``from entities.<module> import ...``) are not repeated because every
definition lives in this one listing.
"""
import json
from datetime import date as _date
from datetime import datetime

from libs import JsonSerializable

# Base URI of Wikidata entities.  The original literals were written as
# "http:\/\/..." (copied from escaped JSON); in Python "\/" is not an escape
# sequence, so the backslashes ended up inside the URI.
_WIKIDATA_ENTITY_URI = "http://www.wikidata.org/entity/"
_GREGORIAN_CALENDAR_URI = _WIKIDATA_ENTITY_URI + "Q1985727"
_EARTH_GLOBE_URI = _WIKIDATA_ENTITY_URI + "Q2"

# Wikibase time precision codes.
_PRECISION_YEAR = 9
_PRECISION_MONTH = 10
_PRECISION_DAY = 11

# Textual date layouts accepted by TimeSnak, most specific first.
_DATE_FORMATS = (
    ("%Y-%m-%d", _PRECISION_DAY),
    ("%Y-%m", _PRECISION_MONTH),
    ("%Y", _PRECISION_YEAR),
)

# Hebrew labels that prefix the keys of "key: value" biography lines.
_BIRTH_DATE_LABEL = "תאריך לידה"
_DEATH_DATE_LABEL = "תאריך פטירה"
_PROFESSION_LABELS = ("מקצוע", "מיקצוע")
_BIRTH_PLACE_LABEL = "מקום לידה"
_DEATH_PLACE_LABEL = "מקום פטירה"


def _parse_date(value):
    """Convert *value* to ``(datetime, wikibase_precision)``.

    Accepts a ``datetime``, a ``date``, an ``int`` year, or a string laid out
    as ``YYYY-MM-DD``, ``YYYY-MM`` or ``YYYY``.

    :raises ValueError: if *value* is none of the above.
    """
    # datetime must be tested first: it is a subclass of date.
    if isinstance(value, datetime):
        return value, _PRECISION_DAY
    if isinstance(value, _date):
        return datetime(value.year, value.month, value.day), _PRECISION_DAY
    if isinstance(value, int):
        return datetime(value, 1, 1), _PRECISION_YEAR
    if isinstance(value, str):
        text = value.strip()
        for layout, precision in _DATE_FORMATS:
            try:
                return datetime.strptime(text, layout), precision
            except ValueError:
                continue
    raise ValueError("Unsupported date value: {!r}".format(value))


def _as_list(value):
    """Return a new list built from *value* (``None`` -> ``[]``, ``str`` -> ``[str]``)."""
    if value is None:
        return []
    if isinstance(value, str):
        # A bare string is treated as a single item, not as a sequence of characters.
        return [value]
    return list(value)


def _claim(snak):
    """Wrap *snak* in the claim dictionary produced by ``to_wd_claims``."""
    return {"type": "claim", "mainsnak": snak.to_json()}


def _first_time_snak(property_id, candidates):
    """Return a TimeSnak for the first usable date in *candidates*, or ``None``."""
    for candidate in candidates or ():
        if not candidate:
            continue
        try:
            return TimeSnak(property=property_id, date=candidate)
        except (ValueError, OverflowError):
            # Free-text dates that cannot be parsed are skipped so one bad
            # value does not prevent the remaining claims from being built.
            continue
    return None


# --- entities/basic_entity.py -----------------------------------------------


class BasicEntity(JsonSerializable):
    """Common base class of all entities; adds nothing to JsonSerializable."""


# --- entities/snaks.py ------------------------------------------------------


class BasicSnak(JsonSerializable):
    """Plain holder of the four fields of a snak."""

    def __init__(self, snaktype, property, datatype, datavalue):
        # NOTE: the parameter name ``property`` shadows the builtin but is part
        # of the keyword interface used by callers, so it is kept.
        self.snaktype = snaktype
        self.property = property
        self.datatype = datatype
        self.datavalue = datavalue


class StringSnak(BasicSnak):
    """Snak holding a plain string value."""

    def __init__(self, property, value):
        datavalue = {
            "type": "string",
            "value": value,
        }
        super().__init__(snaktype="value", property=property, datatype="string", datavalue=datavalue)


class MonoLingualStringSnak(BasicSnak):
    """Snak holding a text together with its language code."""

    def __init__(self, property, value, language):
        datavalue = {
            "type": "monolingualtext",
            "value": {
                "language": language,
                "text": value,
            },
        }
        super().__init__(snaktype="value", property=property, datatype="monolingualtext", datavalue=datavalue)


class EntityIdSnak(BasicSnak):
    """Snak referring to another entity by type and numeric id."""

    def __init__(self, property, entity_type, entity_id):
        # In the Wikibase JSON model the *datavalue* type is
        # "wikibase-entityid" and the *snak* datatype is "wikibase-<entity type>"
        # (e.g. "wikibase-item"); the original code had the two swapped.
        datavalue = {
            "value": {
                "entity-type": entity_type,
                "numeric-id": entity_id,
            },
            "type": "wikibase-entityid",
        }
        super().__init__(snaktype="value", property=property,
                         datatype="wikibase-{}".format(entity_type), datavalue=datavalue)


class UrlSnak(BasicSnak):
    """Snak holding a URL (string value with the ``url`` datatype)."""

    def __init__(self, property, url):
        datavalue = {
            "type": "string",
            "value": url,
        }
        super().__init__(snaktype="value", property=property, datatype="url", datavalue=datavalue)


class TimeSnak(BasicSnak):
    """Snak holding a point in time in the Gregorian calendar."""

    def __init__(self, property, date, precision=None):
        """
        :param property: property id, e.g. ``'P569'``.
        :param date: a ``datetime``, a ``date``, an ``int`` year, or a string
            laid out as ``YYYY-MM-DD``, ``YYYY-MM`` or ``YYYY``.
        :param precision: Wikibase precision code.  When omitted it is derived
            from *date*: 11 (day) for full dates, 10 for year-month strings,
            9 for a bare year.
        :raises ValueError: if *date* cannot be interpreted.
        """
        # The original called ``datetime(date)``, which raises TypeError for
        # every non-datetime input because the constructor needs year, month, day.
        moment, inferred_precision = _parse_date(date)
        if precision is None:
            precision = inferred_precision
        datavalue = {
            "value": {
                # Wikibase expects a signed year and a trailing "Z", with the
                # time of day zeroed; plain isoformat() provides neither.
                "time": "+{:04d}-{:02d}-{:02d}T00:00:00Z".format(moment.year, moment.month, moment.day),
                "timezone": 0,
                "before": 0,
                "after": 0,
                "precision": precision,
                "calendarmodel": _GREGORIAN_CALENDAR_URI,
            },
            "type": "time",
        }
        super().__init__(snaktype="value", property=property, datatype="time", datavalue=datavalue)


class GeoSnak(BasicSnak):
    """Snak holding a coordinate on Earth."""

    def __init__(self, latitude, longitude, precision, property="P625"):
        """
        :param latitude: latitude value stored as given.
        :param longitude: longitude value stored as given.
        :param precision: precision value stored as given.
        :param property: property id.  The original signature had no such
            parameter and passed the *builtin* ``property`` class to the base
            initialiser.  It is appended with a default so existing
            three-argument calls keep working.
            NOTE(review): the default P625 is Wikidata's "coordinate
            location"; confirm it is the property callers intend.
        """
        datavalue = {
            "value": {
                "latitude": latitude,
                "longitude": longitude,
                "altitude": None,
                "precision": precision,
                "globe": _EARTH_GLOBE_URI,
            },
            "type": "globecoordinate",
        }
        super().__init__(snaktype="value", property=property, datatype="globe-coordinate", datavalue=datavalue)


class SomeValueSnak(BasicSnak):
    """Snak of type ``somevalue``: it carries no datatype and no datavalue."""

    def __init__(self, property):
        super().__init__(snaktype="somevalue", property=property, datatype=None, datavalue=None)


# --- entities/institution.py ------------------------------------------------


class Institution(BasicEntity):
    """Placeholder entity; constructing it always fails."""

    def __init__(self, viaf=None):
        # The original first called ``super().__init__(viaf)``; no other class
        # here passes arguments to the base initialiser, and the instance was
        # discarded by the raise below anyway.
        raise NotImplementedError("Institution entities are not implemented yet")


# --- entities/location.py ---------------------------------------------------


class Location(BasicEntity):
    """A place, its names in several languages, its types and free-text comments."""

    # CSV_FIELDS = ["name", "comments"]
    CSV_FIELDS = ["viaf", "name", "types_of_place", "name_in_langs", "comments_list"]
    TYPE = "LOCATION"

    def __init__(self, name, types_of_place, name_in_langs, comments_list, viaf):
        self.name = name
        self.types_of_place = types_of_place
        self.name_in_langs = name_in_langs
        self.comments_list = comments_list
        self.viaf = viaf

    def print_entity(self):
        """Print the entity's fields to standard output."""
        # format() rather than "+" so that a non-string name cannot raise TypeError.
        print("Name = {}".format(self.name))
        print("Name in langs = {}".format(self.name_in_langs))
        print("Types = {}".format(self.types_of_place))
        print("Comments = {}".format(self.comments_list))

    def to_csv_dict(self):
        """Return the name and the JSON-encoded comments as a dictionary."""
        # NOTE(review): these keys match the commented-out CSV_FIELDS above,
        # not the active one; left unchanged because the consumer is not visible.
        return {'name': self.name,
                'comments': json.dumps(self.comments_list, ensure_ascii=False)}


# --- entities/person.py -----------------------------------------------------


class Person(BasicEntity):
    """A person record assembled from catalogue fields and biography lines."""

    # CSV_FIELDS = ["viaf", "name", "biodata", "comments"]
    # (An earlier ``CSV_FIELDS``/``TYPE`` pair that was immediately overwritten
    # by the assignments below has been removed; the effective values are unchanged.)
    CSV_FIELDS = ["viaf", "national_lib_id", "name", "date_of_birth", "date_of_death", "name_in_langs", "bio_data",
                  "struct_bio_data", "comments_list", "profession"]
    TYPE = 'PERSON'

    def __init__(self, name, date_of_birth, date_of_death, name_in_langs, bio_data, comments_list, profession, viaf,
                 national_lib_id):
        """
        :param name: the person's name.
        :param date_of_birth: primary date of birth; stored as the first
            element of ``self.date_of_birth``.
        :param date_of_death: primary date of death; stored as the first
            element of ``self.date_of_death``.
        :param name_in_langs: Mapping of the person's name in various languages, as a dictionary.
            For example:
            {
                "latin": "George",
                "heb": "[george in hebrew]"
            }
        :param bio_data: iterable of biography lines.  Lines of the exact form
            ``"key: value"`` (one colon) are parsed; all other lines are kept
            verbatim as keys of ``self.bio_data`` with an empty-string value.
        :param comments_list: stored as given.
        :param profession: professions (presumably a list of strings - TODO
            confirm against callers).  Copied, so the caller's list is not
            modified when professions found in *bio_data* are appended.
        :param viaf: VIAF identifier, stored as given.
        :param national_lib_id: national library identifier, stored as given.
        """
        self.name = name
        self.name_in_langs = name_in_langs
        self.national_lib_id = national_lib_id
        self.comments_list = comments_list
        self.viaf = viaf

        births = [date_of_birth]
        deaths = [date_of_death]
        professions = _as_list(profession)
        bio_data_dict = {}
        struct_bio_data = {}

        for line in bio_data or ():
            parts = line.split(":")
            if len(parts) != 2:
                # Not a simple "key: value" line; keep it verbatim, without
                # clobbering values already collected under the same key.
                bio_data_dict.setdefault(line, '')
                continue

            key = parts[0].strip()
            value = parts[1].strip()

            # The key never contains ":" after the split, so the labels are
            # matched without the ": " suffix the original compared against
            # (which made all three branches unreachable).
            if key.startswith(_BIRTH_DATE_LABEL):
                births.append(value)
            elif key.startswith(_DEATH_DATE_LABEL):
                deaths.append(value)
            elif key.startswith(_PROFESSION_LABELS):
                professions.append(value)
            else:
                struct_bio_data[key] = value

            collected = bio_data_dict.get(key)
            if not isinstance(collected, list):
                # Absent, or previously stored as a verbatim '' entry.
                collected = bio_data_dict[key] = []
            collected.append(value)

        self.bio_data = bio_data_dict
        self.profession = professions
        self.date_of_birth = births
        self.date_of_death = deaths
        self.struct_bio_data = struct_bio_data

    def print_entity(self):
        """Print the entity's fields to standard output."""
        print("Name = {}".format(self.name))
        # The dates are lists; the original "str + list" raised TypeError here.
        print("Birth year = {}".format(self.date_of_birth))
        print("Death year = {}".format(self.date_of_death))
        print("Names in langs = {}".format(self.name_in_langs))
        print("Bio Data = " + json.dumps(self.bio_data))
        print("Comments = " + json.dumps(self.comments_list))
        print("Profession = " + json.dumps(self.profession))

    def to_csv_dict(self):
        """Return viaf, name, bio data and JSON-encoded comments as a dictionary."""
        # NOTE(review): these keys match the commented-out CSV_FIELDS, not the
        # active one; left unchanged because the consumer is not visible.
        return {'viaf': self.viaf, 'name': self.name, 'biodata': self.bio_data,
                'comments': json.dumps(self.comments_list, ensure_ascii=False)}

    def to_wd_claims(self):
        """Build the list of claim dictionaries describing this person.

        :return: list of ``{"type": "claim", "mainsnak": ...}`` dictionaries:
            birth date (P569), death date (P570), one claim per profession
            (P106), the VIAF id (P214), birth place (P19) and death place (P20).
            Dates that are missing or cannot be parsed produce no claim.
        """
        claims = []

        for property_id, candidates in (('P569', self.date_of_birth), ('P570', self.date_of_death)):
            # The stored lists always hold at least the constructor argument,
            # which may be None, so pick the first value that actually parses.
            time_snak = _first_time_snak(property_id, candidates)
            if time_snak is not None:
                claims.append(_claim(time_snak))

        for occupation in self.profession or ():
            claims.append(_claim(StringSnak(property='P106', value=occupation)))

        if self.viaf:
            claims.append(_claim(StringSnak(property='P214', value=self.viaf)))

        # The original read ``self.struc_bio_data`` (typo), an attribute that
        # __init__ never sets.
        for bio_key, bio_value in (self.struct_bio_data or {}).items():
            if bio_key.startswith(_BIRTH_PLACE_LABEL):
                claims.append(_claim(StringSnak(property='P19', value=bio_value)))
            elif bio_key.startswith(_DEATH_PLACE_LABEL):
                # Property ids are upper-case; the original used 'p20'.
                claims.append(_claim(StringSnak(property='P20', value=bio_value)))

        return claims