From e24a4199fc75f9939c488c46aea3d8ff745a6ba8 Mon Sep 17 00:00:00 2001 From: gilad_ilsar Date: Thu, 22 Sep 2016 14:13:48 +0300 Subject: updates --- .idea/workspace.xml | 155 +++++++++++++++++++++------------------------- entities/location.py | 3 +- entities/person.py | 29 ++++++--- factories/INL_factory.py | 10 ++- parsers/INL_xml_parser.py | 6 +- readers/xml_reader.py | 12 +++- testers/factorytester.py | 2 +- 7 files changed, 119 insertions(+), 98 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 292b4fc..966bd42 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -3,6 +3,10 @@ + + + + @@ -17,7 +21,7 @@ - + + + + + + + + + + + - - + + @@ -51,32 +65,21 @@ - - + + - - + - - - - - - - - - - - - + + @@ -95,8 +98,8 @@ - - + + @@ -105,8 +108,8 @@ - - + + @@ -114,26 +117,6 @@ - - - - - - - - - - - - - - - - - - - - @@ -160,11 +143,11 @@ @@ -551,7 +534,14 @@ @@ -632,8 +622,7 @@ - - + @@ -694,8 +683,7 @@ - - + @@ -762,22 +750,6 @@ - - - - - - - - - - - - - - - - @@ -786,18 +758,18 @@ - + - - + + - + - - + + @@ -810,30 +782,45 @@ - + - - + + - - + + - - + + + + + + + + + + + + + + + + + - - + + @@ -842,8 +829,8 @@ - - + + diff --git a/entities/location.py b/entities/location.py index cd1ca01..07ef7ff 100644 --- a/entities/location.py +++ b/entities/location.py @@ -2,10 +2,11 @@ from entities.basic_entity import BasicEntity class Location(BasicEntity): - def __init__(self, name, types_of_place, name_in_langs): + def __init__(self, name, types_of_place, name_in_langs, comments_list): self.name = name self.types_of_place = types_of_place self.name_in_langs = name_in_langs + self.comments_list = comments_list def print_entity(self): print("Name = " + self.name) diff --git a/entities/person.py b/entities/person.py index d541bb4..fa04566 100644 --- a/entities/person.py +++ b/entities/person.py @@ -2,7 +2,7 @@ from entities.basic_entity import BasicEntity class Person(BasicEntity): - def __init__(self, name, date_of_birth, name_in_langs, bio_data): + def __init__(self, name, date_of_birth, name_in_langs, bio_data, comments_list): """ :param name: @@ -22,6 +22,7 @@ class Person(BasicEntity): self.birth_year = date_of_birth.strip() self.death_year = '' self.name_in_langs = name_in_langs + ''' place_of_birth = list() place_of_death = list() profession = list() @@ -37,12 +38,26 @@ class Person(BasicEntity): self.place_of_birth = place_of_birth self.place_of_death = place_of_death self.profession = profession + ''' + bio_data_dict = dict() + for elem in bio_data: + elem_splitted = elem.split(":") + if len(elem_splitted) == 2: + bio_data_key = elem_splitted[0] + bio_data_value = elem_splitted[1] + if bio_data_key in bio_data_dict: + bio_data_dict.get(bio_data_key).append(bio_data_value) + else: + bio_data_dict.update( + {bio_data_key: [bio_data_value]} + ) + else: + bio_data_dict.update({elem: ''}) + self.bio_data = bio_data_dict + self.comments_list = comments_list + def print_entity(self): print("Name = " + self.name) - print("Birth year = " + self.birth_year) - print("Death year = " + self.death_year) - print("Names in langs" + str(self.name_in_langs)) - print("Places of birth = " + str(self.place_of_birth)) - print("Places of death = " + str(self.place_of_death)) - print("profession = " + str(self.profession)) + print("Names in langs = " + str(self.name_in_langs)) + print("Bio Data = " + str(self.bio_data)) diff --git a/factories/INL_factory.py b/factories/INL_factory.py index 6b75f28..e52257f 100644 --- a/factories/INL_factory.py +++ b/factories/INL_factory.py @@ -47,6 +47,7 @@ class INLFactory(BasicFactory): name = '' name_in_langs = dict() bio_data = list() + comment_list = list() eng_name = '' date_of_birth = '' #get the names and date of birth and bio data @@ -76,7 +77,9 @@ class INLFactory(BasicFactory): name_in_langs.update({field.text: [name_diff]}) elif tag == 'bio_data': bio_data.append(field.text) - return entities.Person(eng_name, date_of_birth, name_in_langs, bio_data) + elif tag == 'comment': + comment_list.append(field.text) + return entities.Person(eng_name, date_of_birth, name_in_langs, bio_data, comment_list) #110 is institue elif record_key == '110': return entities.Institution() @@ -84,6 +87,7 @@ class INLFactory(BasicFactory): elif record_key == '151': name_in_langs = dict() types_of_place = list() + comment_list = list() for field in raw_object.getroot(): key = field.attrib.get('tag') tag = entity_keys.get(key) @@ -108,7 +112,9 @@ class INLFactory(BasicFactory): name_in_langs.get(field.text).append(name_diff) else: name_in_langs.update({field.text: [name_diff]}) - return entities.Location(eng_name, types_of_place , name_in_langs) + elif tag == 'comment': + comment_list.append(field.text) + return entities.Location(eng_name, types_of_place , name_in_langs, comment_list) else: raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self))) diff --git a/parsers/INL_xml_parser.py b/parsers/INL_xml_parser.py index 968bf55..879dad7 100644 --- a/parsers/INL_xml_parser.py +++ b/parsers/INL_xml_parser.py @@ -5,11 +5,13 @@ except ImportError: KNOWN_FIELD_TAGS = ['100', '110', '151'] +TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451'] class INLXmlParser: - def __init__(self, reader, whitelist=None): + def __init__(self, reader, whitelist=TAG_WHITELIST): self.reader = reader - self.whitelist = whitelist or KNOWN_FIELD_TAGS + #self.whitelist = whitelist or KNOWN_FIELD_TAGS + self.whitelist = whitelist def clearxml(self): newTreeRoot = ET.Element('data') diff --git a/readers/xml_reader.py b/readers/xml_reader.py index bd7821b..0ed07d5 100644 --- a/readers/xml_reader.py +++ b/readers/xml_reader.py @@ -1,5 +1,5 @@ # from __future__ import absolute_import -import parsers +import parsers, factories try: import xml.etree.cElementTree as ET @@ -18,11 +18,21 @@ def read_file(path, element_key): # get the root element event, root = context.__next__() + #the factory + inl_factory = factories.INLFactory() + for event, element in context: if 'end' in event: if element_key in element.tag: + #enter the processing here record_counter += 1 + #cleaned element is a tree cleaned_element = parsers.INLXmlParser(element).clearxml() + entity = inl_factory.get_entity(cleaned_element) + + #test print the entity + entity.print_entity() + # import pdb; pdb.set_trace() print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', cleaned_element.getroot().text) element.clear() diff --git a/testers/factorytester.py b/testers/factorytester.py index 121e068..1fb6154 100644 --- a/testers/factorytester.py +++ b/testers/factorytester.py @@ -17,5 +17,5 @@ for record in xmltree.getroot(): entities.append(inl_factory.get_entity(clean_record)) for entity in entities: - print(entity) + entity. print_entity() -- cgit v1.2.3