From e34be2e06f88032824beaec5173419c60602591f Mon Sep 17 00:00:00 2001 From: gilad_ilsar Date: Thu, 22 Sep 2016 11:51:49 +0300 Subject: tester and person entity --- .idea/misc.xml | 12 +- .idea/modules.xml | 2 +- .idea/parser.iml | 12 ++ .idea/workspace.xml | 517 +++++++++++++++++++++++++++++++++++----------- entities/person.py | 34 ++- factories/INL_factory.py | 73 +++++-- parsers/INL_xml_parser.py | 14 +- parsers/__init__.py | 3 +- testers/factorytester.py | 21 ++ 9 files changed, 524 insertions(+), 164 deletions(-) create mode 100644 .idea/parser.iml create mode 100644 testers/factorytester.py diff --git a/.idea/misc.xml b/.idea/misc.xml index df245c4..f778c9e 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,14 +1,4 @@ - - - - - - - - - - - + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml index 9a7bd2d..405d108 100644 --- a/.idea/modules.xml +++ b/.idea/modules.xml @@ -2,7 +2,7 @@ - + \ No newline at end of file diff --git a/.idea/parser.iml b/.idea/parser.iml new file mode 100644 index 0000000..6f63a63 --- /dev/null +++ b/.idea/parser.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml index f2b732c..69eda32 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -1,9 +1,16 @@ - + + + + + + + + - + @@ -16,7 +23,7 @@ - + - - + + - - + + - + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -49,14 +132,35 @@ + + + @@ -69,8 +173,8 @@ @@ -101,87 +205,17 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -192,11 +226,28 @@ - + - - + + + - - - + + + + + + - - 1474498136177 + + 1474481137431 + + 1474490333649 + + + 1474498941858 + + - + - + - + - - - + + + - + @@ -475,12 +528,34 @@ + + + - + + - + + + + + + + + + + + + + + + + + @@ -488,7 +563,167 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -496,21 +731,51 @@ - + - - + + + + + + + + + + + + + + + + + + + + + + - - + + + + + + + + + + + + \ No newline at end of file diff --git a/entities/person.py b/entities/person.py index b9e9d78..d541bb4 100644 --- a/entities/person.py +++ b/entities/person.py @@ -2,7 +2,7 @@ from entities.basic_entity import BasicEntity class Person(BasicEntity): - def __init__(self, name, date_of_birth, name_in_langs): + def __init__(self, name, date_of_birth, name_in_langs, bio_data): """ :param name: @@ -14,5 +14,35 @@ class Person(BasicEntity): } """ self.name = name - self.date_of_birth = date_of_birth + years_parts = date_of_birth.split('-') + if (len(years_parts) == 2): + self.birth_year = years_parts[0] + self.death_year = years_parts[1] + else: + self.birth_year = date_of_birth.strip() + self.death_year = '' self.name_in_langs = name_in_langs + place_of_birth = list() + place_of_death = list() + profession = list() + for comment in bio_data: + encoded_comment = ''.join(comment).strip() + if encoded_comment.startswith(u"מקום לידה: "): + place_of_birth.append(encoded_comment.partition(u"מקום לידה: ")[2]) + if encoded_comment.startswith(u"מקום פטירה: "): + place_of_death.append(encoded_comment.partition(u"מקום פטירה: ")[2]) + if encoded_comment.startswith(u"מקצוע: "): + profession.append(encoded_comment.partition(u"מקום פטירה: ")[2]) + + self.place_of_birth = place_of_birth + self.place_of_death = place_of_death + self.profession = profession + + def print_entity(self): + print("Name = " + self.name) + print("Birth year = " + self.birth_year) + print("Death year = " + self.death_year) + print("Names in langs" + str(self.name_in_langs)) + print("Places of birth = " + str(self.place_of_birth)) + print("Places of death = " + str(self.place_of_death)) + print("profession = " + str(self.profession)) diff --git a/factories/INL_factory.py b/factories/INL_factory.py index b61c487..adc5b1a 100644 --- a/factories/INL_factory.py +++ b/factories/INL_factory.py @@ -1,5 +1,6 @@ import entities from factories import BasicFactory +import xml.etree.cElementTree as ET TAG_TO_ENTITY_MAPPING = { '100': entities.Person, @@ -10,15 +11,20 @@ TAG_TO_ENTITY_MAPPING = { ENTITY_KEYS = { '100.a': 'name', + '100.9': 'name_langindic', '100.d': 'date_of_birth', '400.a': 'name_in_langs', + '400.9': 'langs_langindic', + '678.a': 'bio_data', '151.a': 'name', + '151.9': 'name_langindic', '451:a': 'name_in_langs', + '451:9': 'langs_langindic', '550.a': 'type' } -def get_record_key(self, record): +def get_record_key(record): root = record.getroot() for field in root: field_tag = field.attrib.get('tag') @@ -33,40 +39,73 @@ class INLFactory(BasicFactory): def __init__(self, tag_to_entity_mapping=None): self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING - def get_entity(self, entity_keys, raw_object): + def get_entity(self, raw_object, entity_keys=ENTITY_KEYS): record_key = get_record_key(raw_object) if record_key == '100': - has_name = False - name_in_langs = [] + name = '' + name_in_langs = dict() + bio_data = list() + eng_name = '' + date_of_birth = '' + #get the names and date of birth and bio data for field in raw_object.getroot(): key = field.attrib.get('tag') tag = entity_keys.get(key) - if tag == 'name' and not has_name: + if tag == 'name': name = field.text - has_name = True + elif tag == 'name_langindic': + # chack if this english name + if field.text == 'lat': + eng_name = name + # else add it to name_in_langs + else: + if field.text in name_in_langs: + name_in_langs.get(field.text).append(name) + else: + name_in_langs.update({field.text: [name]}) elif tag == 'date_of_birth': date_of_birth = field.text elif tag == 'name_in_langs': - name_in_langs.append(field.text) - return entities.Person(name, date_of_birth, name_in_langs) + name_diff = field.text + elif tag == 'langs_langindic': + if field.text in name_in_langs: + name_in_langs.get(field.text).append(name_diff) + else: + name_in_langs.update({field.text: [name]}) + elif tag == 'bio_data': + bio_data.append(field.text) + return entities.Person(eng_name, date_of_birth, name_in_langs, bio_data) elif record_key == '110': return entities.Institution() elif record_key == '151': - has_name = False - name_in_langs = [] - type = [] + name_in_langs = dict() + types = [] for field in raw_object.getroot(): key = field.attrib.get('tag') tag = entity_keys.get(key) - if tag == 'name' and not has_name: + if tag == 'name': name = field.text - has_name = True + elif tag == 'name_langindic': + # chack if this english name + if field.text == 'lat': + eng_name = name + # else add it to name_in_langs + else: + if field.text in name_in_langs: + name_in_langs.get(field.text).append(name) + else: + name_in_langs.update({field.text: [name]}) elif tag == 'type': - type.append(field.text) + types.append(field.text) elif tag == 'name_in_langs': - name_in_langs.append(field.text) - return entities.Location(name, type, name_in_langs) + name_diff = field.text + elif tag == 'langs_langindic': + if field.text in name_in_langs: + name_in_langs.get(field.text).append(name_diff) + else: + name_in_langs.update({field.text: [name]}) + return entities.Location(eng_name, types, name_in_langs) else: - raise KeyError('Key {} was not recognized for factory {}'.format(entity_key, type(self))) + raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self))) diff --git a/parsers/INL_xml_parser.py b/parsers/INL_xml_parser.py index 4cd04ef..968bf55 100644 --- a/parsers/INL_xml_parser.py +++ b/parsers/INL_xml_parser.py @@ -20,14 +20,16 @@ class INLXmlParser: for field in self.reader: fieldtag = field.attrib.get('tag') if fieldtag in self.whitelist: - newFieldTag = fieldtag + temptag = fieldtag # tag 700 and 400 are the same - if newFieldTag == '700': - newFieldTag = '400' - newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag}) + if temptag == '700': + temptag = '400' for data in field: - subData = ET.SubElement(newTag, data.tag, data.attrib) - subData.text = data.text + newFieldTag = temptag + newFieldTag += '.' + newFieldTag += data.attrib.get('code') + newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag}) + newTag.text = data.text newRecordTree = ET.ElementTree(newRecord) return newRecordTree diff --git a/parsers/__init__.py b/parsers/__init__.py index e3a246d..d32c917 100644 --- a/parsers/__init__.py +++ b/parsers/__init__.py @@ -1 +1,2 @@ -from INL_xml_parser import INLXmlParser \ No newline at end of file + +from .INL_xml_parser import INLXmlParser \ No newline at end of file diff --git a/testers/factorytester.py b/testers/factorytester.py new file mode 100644 index 0000000..cc95bab --- /dev/null +++ b/testers/factorytester.py @@ -0,0 +1,21 @@ +from __future__ import absolute_import +import parsers +import factories +import xml.etree.cElementTree as ET + +xmlpath = 'C:/Users/Ilsar/Documents/datahack/xml_example.xml' +whitelist = ['100', '374', '400', '151', '451', '550', '551', '678'] + + +xmltree = ET.parse(xmlpath) +entities = list() +inl_factory = factories.INLFactory() + +for record in xmltree.getroot(): + inl_parser = parsers.INLXmlParser(record, whitelist) + clean_record = inl_parser.clearxml() + entities.append(inl_factory.get_entity(clean_record)) + +for entity in entities: + entity.print_entity() + -- cgit v1.2.3