From 4d108fca731bbd152ad058d007e2de6440f2c98a Mon Sep 17 00:00:00 2001 From: gilad_ilsar Date: Wed, 21 Sep 2016 22:49:44 +0300 Subject: change the xml cleaner to set tags as ###.* --- .gitignore | 15 --------------- .idea/lib2wiki.iml | 11 ----------- .idea/misc.xml | 12 +++++++++++- .idea/modules.xml | 2 +- .idea/parser.iml | 12 ++++++++++++ parsers/INL_xml_parser.py | 41 ++++++++++++++++++++--------------------- 6 files changed, 44 insertions(+), 49 deletions(-) delete mode 100644 .idea/lib2wiki.iml create mode 100644 .idea/parser.iml diff --git a/.gitignore b/.gitignore index 5e64fdb..7e99e36 100644 --- a/.gitignore +++ b/.gitignore @@ -1,16 +1 @@ -# User-specific stuff: -.idea/workspace.xml -.idea/tasks.xml -.idea/dictionaries -.idea/vcs.xml -.idea/jsLibraryMappings.xml - -# Sensitive or high-churn files: -.idea/dataSources.ids -.idea/dataSources.xml -.idea/dataSources.local.xml -.idea/sqlDataSources.xml -.idea/dynamic.xml -.idea/uiDesigner.xml - *.pyc \ No newline at end of file diff --git a/.idea/lib2wiki.iml b/.idea/lib2wiki.iml deleted file mode 100644 index 6711606..0000000 --- a/.idea/lib2wiki.iml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index de9bbc8..8e8cee7 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,14 @@ - + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml index 9a7bd2d..405d108 100644 --- a/.idea/modules.xml +++ b/.idea/modules.xml @@ -2,7 +2,7 @@ - + \ No newline at end of file diff --git a/.idea/parser.iml b/.idea/parser.iml new file mode 100644 index 0000000..6f63a63 --- /dev/null +++ b/.idea/parser.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/parsers/INL_xml_parser.py b/parsers/INL_xml_parser.py index 2ea9a9b..671d2d9 100644 --- a/parsers/INL_xml_parser.py +++ b/parsers/INL_xml_parser.py @@ -6,29 +6,28 @@ class INLXmlParser: self.whitelist = lst self.xmlpath = path - def clearxml(self): - xmltree = ET.parse(self.xmlpath) - # root == list of records - root = xmltree.getroot() - # create new data - newTreeRoot = ET.Element('data') + #expects to get a record as ElementTree + def clearxml(self, record): + # root == list of tags + root = record.getroot() # scan the datafields in the records and copy to the new one only the tags in the whitelist - for record in root: - # create new record - newRecord = ET.SubElement(newTreeRoot, 'record') - for field in record: - fieldtag = field.attrib.get('tag') - if fieldtag in self.whitelist: - newFieldTag = fieldtag - # tag 700 and 400 are the same - if newFieldTag == '700': - newFieldTag = '400' + # create new record + newRecord = ET.Element('record') + for field in root: + fieldtag = field.attrib.get('tag') + if fieldtag in self.whitelist: + tempTag = fieldtag + # tag 700 and 400 are the same + if tempTag == '700': + tempTag = '400' + for data in field: + newFieldTag = tempTag + newFieldTag +='.' + newFieldTag += data.attrib.get('code') newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag}) - for data in field: - subData = ET.SubElement(newTag, data.tag, data.attrib) - subData.text = data.text + newTag.text = data.text - newTree = ET.ElementTree(newTreeRoot) - return newTree + newRecordTree = ET.ElementTree(newRecord) + return newRecordTree -- cgit v1.2.3 From 081eac29a20ab8485f2b8180654a6d4b808e2df7 Mon Sep 17 00:00:00 2001 From: gilad_ilsar Date: Wed, 21 Sep 2016 23:38:53 +0300 Subject: implemented the factory --- .idea/workspace.xml | 151 +++++++++++++++++++++++++++++++---------------- entities/location.py | 4 +- factories/INL_factory.py | 61 +++++++++++++++++-- 3 files changed, 158 insertions(+), 58 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 5f621c3..c88ce20 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,11 +2,9 @@ - - - - - + + + @@ -37,11 +35,45 @@ + + + + + + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -86,14 +118,15 @@ @@ -439,20 +472,20 @@ - - + + + - + + - - @@ -483,7 +516,11 @@ - + + + + + @@ -495,14 +532,6 @@ - - - - - - - - @@ -510,17 +539,11 @@ - - - - - - - + @@ -532,14 +555,6 @@ - - - - - - - - @@ -548,14 +563,6 @@ - - - - - - - - @@ -580,6 +587,14 @@ + + + + + + + + @@ -588,13 +603,49 @@ - + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/entities/location.py b/entities/location.py index 8130632..9636c3b 100644 --- a/entities/location.py +++ b/entities/location.py @@ -2,7 +2,7 @@ from entities.basic_entity import BasicEntity class Location(BasicEntity): - def __init__(self, name, types, coordinates): + def __init__(self, name, types, name_in_langs): self.name = name self.types = types - self.coordinates = coordinates + self.name_in_langs = name_in_langs diff --git a/factories/INL_factory.py b/factories/INL_factory.py index 6607368..b61c487 100644 --- a/factories/INL_factory.py +++ b/factories/INL_factory.py @@ -8,16 +8,65 @@ TAG_TO_ENTITY_MAPPING = { } +ENTITY_KEYS = { + '100.a': 'name', + '100.d': 'date_of_birth', + '400.a': 'name_in_langs', + '151.a': 'name', + '451:a': 'name_in_langs', + '550.a': 'type' +} + + +def get_record_key(self, record): + root = record.getroot() + for field in root: + field_tag = field.attrib.get('tag') + if '100' in field_tag: + return '100' + if '151' in field_tag: + return '151' + if '110' in field_tag: + return '110' + class INLFactory(BasicFactory): def __init__(self, tag_to_entity_mapping=None): self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING - def get_entity(self, entity_key, raw_object): - if entity_key == '100': - return entities.Person('', '', '') - elif entity_key == '110': + def get_entity(self, entity_keys, raw_object): + record_key = get_record_key(raw_object) + if record_key == '100': + has_name = False + name_in_langs = [] + for field in raw_object.getroot(): + key = field.attrib.get('tag') + tag = entity_keys.get(key) + if tag == 'name' and not has_name: + name = field.text + has_name = True + elif tag == 'date_of_birth': + date_of_birth = field.text + elif tag == 'name_in_langs': + name_in_langs.append(field.text) + return entities.Person(name, date_of_birth, name_in_langs) + elif record_key == '110': return entities.Institution() - elif entity_key == '151': - return entities.Location('', '', '') + elif record_key == '151': + has_name = False + name_in_langs = [] + type = [] + for field in raw_object.getroot(): + key = field.attrib.get('tag') + tag = entity_keys.get(key) + if tag == 'name' and not has_name: + name = field.text + has_name = True + elif tag == 'type': + type.append(field.text) + elif tag == 'name_in_langs': + name_in_langs.append(field.text) + return entities.Location(name, type, name_in_langs) else: raise KeyError('Key {} was not recognized for factory {}'.format(entity_key, type(self))) + + -- cgit v1.2.3