From 6405185cd4136b04b45b3b9d756fdd5d38405f07 Mon Sep 17 00:00:00 2001 From: gilad_ilsar Date: Thu, 22 Sep 2016 14:53:42 +0300 Subject: updates --- .idea/workspace.xml | 111 ++++++++++++++++++++++++++++------------------ factories/INL_factory.py | 3 +- parsers/INL_xml_parser.py | 5 +-- readers/xml_reader.py | 12 +++-- 4 files changed, 81 insertions(+), 50 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index d837dbc..bf6f1e4 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -3,12 +3,9 @@ - - - @@ -21,7 +18,8 @@ - + + - + - - + + @@ -45,8 +43,8 @@ - - + + @@ -65,8 +63,8 @@ - - + + @@ -75,7 +73,7 @@ - + @@ -108,8 +106,8 @@ - - + + @@ -144,11 +142,11 @@ @@ -245,7 +243,7 @@ - + + + - + + - - + + + @@ -548,12 +566,19 @@ - + @@ -774,14 +799,6 @@ - - - - - - - - @@ -798,26 +815,34 @@ - + - - + + - + - + + + + + + + + + - - + + @@ -827,18 +852,18 @@ - - + + - + - - + + diff --git a/factories/INL_factory.py b/factories/INL_factory.py index e9838f4..8bf2348 100644 --- a/factories/INL_factory.py +++ b/factories/INL_factory.py @@ -120,6 +120,7 @@ class INLFactory(BasicFactory): comment_list.append(field.text) return entities.Location(eng_name, types_of_place , name_in_langs, comment_list) else: - raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self))) + return None + # raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self))) diff --git a/parsers/INL_xml_parser.py b/parsers/INL_xml_parser.py index 3d9b1b7..1a06f6b 100644 --- a/parsers/INL_xml_parser.py +++ b/parsers/INL_xml_parser.py @@ -14,11 +14,10 @@ class INLXmlParser: self.whitelist = whitelist def clearxml(self): - newTreeRoot = ET.Element('data') # # scan the datafields in the records and copy to the new one only the tags in the whitelist # for record in root: # create new record - newRecord = ET.SubElement(newTreeRoot, 'record') + newRecord = ET.Element('record') for field in self.reader: fieldtag = field.attrib.get('tag') if fieldtag in self.whitelist: @@ -34,4 +33,4 @@ class INLXmlParser: newTag.text = data.text newRecordTree = ET.ElementTree(newRecord) - return newRecordTree + return ET.ElementTree(newRecord) diff --git a/readers/xml_reader.py b/readers/xml_reader.py index 3e630cb..ec2c696 100644 --- a/readers/xml_reader.py +++ b/readers/xml_reader.py @@ -26,12 +26,18 @@ def read_file(path, element_key): if element_key in element.tag: #enter the processing here record_counter += 1 + + for field in element: + print(field.tag, field.attrib) + #cleaned element is a tree - cleaned_element = parsers.INLXmlParser(element).clearxml() + inl_parser = parsers.INLXmlParser(element) + cleaned_element = inl_parser.clearxml() entity = inl_factory.get_entity(cleaned_element) #test print the entity - entity.print_entity() + if entity != None: + entity.print_entity() #TODO analys and upload the entity @@ -43,4 +49,4 @@ def read_file(path, element_key): if __name__ == '__main__': - read_file(r"../../NLI-nnl10.xml", 'record') + read_file(r"C:/Users/Ilsar/Documents/datahack/NLI-nnl10.xml", 'record') -- cgit v1.2.3