diff options
author | Tzafrir Cohen <tzafrir@cohens.org.il> | 2016-09-25 20:28:40 +0300 |
---|---|---|
committer | Tzafrir Cohen <tzafrir@cohens.org.il> | 2016-09-25 20:28:40 +0300 |
commit | fa107061d3d8f3decf12120ba2f4ebed1e78218a (patch) | |
tree | b07d2a56f0b9c0d606336bf51f4e1126bee02206 | |
parent | 6503c222d1c2145f32d9678157a9659b70a12c83 (diff) |
Use UNIX line endings
-rw-r--r-- | entities/__init__.py | 4 | ||||
-rw-r--r-- | entities/basic_entity.py | 10 | ||||
-rw-r--r-- | entities/institution.py | 12 | ||||
-rw-r--r-- | entities/location.py | 50 | ||||
-rw-r--r-- | entities/person.py | 152 | ||||
-rw-r--r-- | factories/INL_factory.py | 260 | ||||
-rw-r--r-- | factories/__init__.py | 2 | ||||
-rw-r--r-- | factories/basic_factory.py | 6 | ||||
-rw-r--r-- | libs/json_tools.py | 18 | ||||
-rw-r--r-- | parsers/INL_xml_parser.py | 72 | ||||
-rw-r--r-- | parsers/__init__.py | 2 | ||||
-rw-r--r-- | parsers/basic_parser.py | 12 | ||||
-rw-r--r-- | readers/xml_reader.py | 122 | ||||
-rw-r--r-- | testers/factorytester.py | 42 | ||||
-rw-r--r-- | writers/wd_writer.py | 14 |
15 files changed, 389 insertions, 389 deletions
diff --git a/entities/__init__.py b/entities/__init__.py index 701846e..907ef4d 100644 --- a/entities/__init__.py +++ b/entities/__init__.py @@ -1,3 +1,3 @@ -from entities.person import Person
-from entities.institution import Institution
+from entities.person import Person +from entities.institution import Institution from entities.location import Location
\ No newline at end of file diff --git a/entities/basic_entity.py b/entities/basic_entity.py index 9e8f11b..9181422 100644 --- a/entities/basic_entity.py +++ b/entities/basic_entity.py @@ -1,5 +1,5 @@ -from libs import JsonSerializable
-
-
-class BasicEntity(JsonSerializable):
- pass
+from libs import JsonSerializable + + +class BasicEntity(JsonSerializable): + pass diff --git a/entities/institution.py b/entities/institution.py index 6be86fc..4538207 100644 --- a/entities/institution.py +++ b/entities/institution.py @@ -1,6 +1,6 @@ -from entities.basic_entity import BasicEntity
-
-
-class Institution(BasicEntity):
- def __init__(self):
- raise NotImplementedError()
+from entities.basic_entity import BasicEntity + + +class Institution(BasicEntity): + def __init__(self): + raise NotImplementedError() diff --git a/entities/location.py b/entities/location.py index f782e1f..a43eb8d 100644 --- a/entities/location.py +++ b/entities/location.py @@ -1,25 +1,25 @@ -import json
-
-from entities.basic_entity import BasicEntity
-
-
-class Location(BasicEntity):
- def __init__(self, name, types_of_place, name_in_langs, comments_list):
- self.name = name
- self.types_of_place = types_of_place
- self.name_in_langs = name_in_langs
- self.comments_list = comments_list
-
- CSV_FIELDS = ["name", "comments"]
- TYPE = "LOCATION"
-
-
- def print_entity(self):
- print("Name = " + self.name)
- print("Name in langs = " + str(self.name_in_langs))
- print("Types = " + str(self.types_of_place))
- print("Comments = " + str(self.comments_list))
-
- def to_csv_dict(self):
- return {'name': self.name,
- 'comments': json.dumps(self.comments_list, ensure_ascii=False)}
+import json + +from entities.basic_entity import BasicEntity + + +class Location(BasicEntity): + def __init__(self, name, types_of_place, name_in_langs, comments_list): + self.name = name + self.types_of_place = types_of_place + self.name_in_langs = name_in_langs + self.comments_list = comments_list + + CSV_FIELDS = ["name", "comments"] + TYPE = "LOCATION" + + + def print_entity(self): + print("Name = " + self.name) + print("Name in langs = " + str(self.name_in_langs)) + print("Types = " + str(self.types_of_place)) + print("Comments = " + str(self.comments_list)) + + def to_csv_dict(self): + return {'name': self.name, + 'comments': json.dumps(self.comments_list, ensure_ascii=False)} diff --git a/entities/person.py b/entities/person.py index a5aa396..b315aac 100644 --- a/entities/person.py +++ b/entities/person.py @@ -1,76 +1,76 @@ -import json
-
-from entities.basic_entity import BasicEntity
-
-
-class Person(BasicEntity):
- def __init__(self, name, date_of_birth, name_in_langs, bio_data, comments_list, profession):
- """
-
- :param name:
- :param date_of_birth:
- :param name_in_langs: Mapping of the persons's name in various languages, as a dictionary. For example:
- {
- "latin": "George"
- "heb": "[george in hebrew]"
- }
- """
- self.name = name
- years_parts = date_of_birth.split('-')
- if (len(years_parts) == 2):
- self.birth_year = years_parts[0]
- self.death_year = years_parts[1]
- else:
- self.birth_year = date_of_birth.strip()
- self.death_year = ''
- self.name_in_langs = name_in_langs
- '''
- place_of_birth = list()
- place_of_death = list()
- profession = list()
- for comment in bio_data:
- encoded_comment = ''.join(comment).strip()
- if encoded_comment.startswith(u"מקום לידה: "):
- place_of_birth.append(encoded_comment.partition(u"מקום לידה: ")[2])
- if encoded_comment.startswith(u"מקום פטירה: "):
- place_of_death.append(encoded_comment.partition(u"מקום פטירה: ")[2])
- if encoded_comment.startswith(u"מקצוע: "):
- profession.append(encoded_comment.partition(u"מקום פטירה: ")[2])
-
- self.place_of_birth = place_of_birth
- self.place_of_death = place_of_death
- self.profession = profession
- '''
- bio_data_dict = dict()
- for elem in bio_data:
- elem_splitted = elem.split(":")
- if len(elem_splitted) == 2:
- bio_data_key = elem_splitted[0]
- bio_data_value = elem_splitted[1]
- if bio_data_key in bio_data_dict:
- bio_data_dict.get(bio_data_key).append(bio_data_value)
- else:
- bio_data_dict.update(
- {bio_data_key: [bio_data_value]}
- )
- else:
- bio_data_dict.update({elem: ''})
- self.bio_data = bio_data_dict
- self.comments_list = comments_list
- self.profession = profession
-
- CSV_FIELDS = ["name", "biodata", "comments"]
- TYPE = 'PERSON'
-
- def print_entity(self):
- print("Name = " + self.name)
- print("Birth year = " + self.birth_year)
- print("Death year = " + self.death_year)
- print("Names in langs = " + str(self.name_in_langs))
- print("Bio Data = " + str(self.bio_data))
- print("Comments = " + str(self.comments_list))
- print("Profession = " + str(self.profession))
-
- def to_csv_dict(self):
- return {'name': self.name, 'biodata': self.bio_data,
- 'comments': json.dumps(self.comments_list, ensure_ascii=False)}
+import json + +from entities.basic_entity import BasicEntity + + +class Person(BasicEntity): + def __init__(self, name, date_of_birth, name_in_langs, bio_data, comments_list, profession): + """ + + :param name: + :param date_of_birth: + :param name_in_langs: Mapping of the persons's name in various languages, as a dictionary. For example: + { + "latin": "George" + "heb": "[george in hebrew]" + } + """ + self.name = name + years_parts = date_of_birth.split('-') + if (len(years_parts) == 2): + self.birth_year = years_parts[0] + self.death_year = years_parts[1] + else: + self.birth_year = date_of_birth.strip() + self.death_year = '' + self.name_in_langs = name_in_langs + ''' + place_of_birth = list() + place_of_death = list() + profession = list() + for comment in bio_data: + encoded_comment = ''.join(comment).strip() + if encoded_comment.startswith(u"מקום לידה: "): + place_of_birth.append(encoded_comment.partition(u"מקום לידה: ")[2]) + if encoded_comment.startswith(u"מקום פטירה: "): + place_of_death.append(encoded_comment.partition(u"מקום פטירה: ")[2]) + if encoded_comment.startswith(u"מקצוע: "): + profession.append(encoded_comment.partition(u"מקום פטירה: ")[2]) + + self.place_of_birth = place_of_birth + self.place_of_death = place_of_death + self.profession = profession + ''' + bio_data_dict = dict() + for elem in bio_data: + elem_splitted = elem.split(":") + if len(elem_splitted) == 2: + bio_data_key = elem_splitted[0] + bio_data_value = elem_splitted[1] + if bio_data_key in bio_data_dict: + bio_data_dict.get(bio_data_key).append(bio_data_value) + else: + bio_data_dict.update( + {bio_data_key: [bio_data_value]} + ) + else: + bio_data_dict.update({elem: ''}) + self.bio_data = bio_data_dict + self.comments_list = comments_list + self.profession = profession + + CSV_FIELDS = ["name", "biodata", "comments"] + TYPE = 'PERSON' + + def print_entity(self): + print("Name = " + self.name) + print("Birth year = " + self.birth_year) + print("Death year = " + self.death_year) + print("Names in langs = " + str(self.name_in_langs)) + print("Bio Data = " + str(self.bio_data)) + print("Comments = " + str(self.comments_list)) + print("Profession = " + str(self.profession)) + + def to_csv_dict(self): + return {'name': self.name, 'biodata': self.bio_data, + 'comments': json.dumps(self.comments_list, ensure_ascii=False)} diff --git a/factories/INL_factory.py b/factories/INL_factory.py index 286762a..f4e494f 100644 --- a/factories/INL_factory.py +++ b/factories/INL_factory.py @@ -1,130 +1,130 @@ -import entities
-from factories import BasicFactory
-import xml.etree.cElementTree as ET
-
-TAG_TO_ENTITY_MAPPING = {
- '100': entities.Person,
- '110': entities.Institution,
- '151': entities.Location
-}
-
-
-ENTITY_KEYS = {
- '100.a': 'name',
- '100.9': 'name_langindic',
- '100.d': 'date_of_birth',
- '400.a': 'name_in_langs',
- '400.9': 'langs_langindic',
- '678.a': 'bio_data',
- '151.a': 'name',
- '151.9': 'name_langindic',
- '451:a': 'name_in_langs',
- '451:9': 'langs_langindic',
- '550.a': 'type_of_place',
- '667.a': 'comment',
- '374.a': 'profession'
-}
-
-
-def get_record_key(record):
- root = record.getroot()
- for field in root:
- field_tag = field.attrib.get('tag')
- if '100' in field_tag:
- return '100'
- if '151' in field_tag:
- return '151'
- if '110' in field_tag:
- return '110'
-
-class INLFactory(BasicFactory):
- def __init__(self, tag_to_entity_mapping=None):
- self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING
-
- def get_entity(self, raw_object, entity_keys=ENTITY_KEYS):
- record_key = get_record_key(raw_object)
- #100 is person
- if record_key == '100':
- name = ''
- name_in_langs = dict()
- bio_data = list()
- comment_list = list()
- eng_name = ''
- date_of_birth = ''
- profession = list()
- name_diff = ''
- #get the names and date of birth and bio data
- for field in raw_object.getroot():
- key = field.attrib.get('tag')
- tag = entity_keys.get(key)
- if tag == 'name':
- name = field.text
- elif tag == 'name_langindic':
- # chack if this english name
- if field.text == 'lat':
- eng_name = name
- # else add it to name_in_langs
- else:
- if field.text in name_in_langs:
- name_in_langs.get(field.text).append(name)
- else:
- name_in_langs.update({field.text: [name]})
- elif tag == 'date_of_birth':
- date_of_birth = field.text
- elif tag == 'name_in_langs':
- name_diff = field.text
- elif tag == 'langs_langindic':
- if field.text in name_in_langs:
- name_in_langs.get(field.text).append(name_diff)
- else:
- name_in_langs.update({field.text: [name_diff]})
- elif tag == 'bio_data':
- bio_data.append(field.text)
- elif tag == 'comment':
- comment_list.append(field.text)
- elif tag == 'profession':
- profession.append(field.text)
- return entities.Person(eng_name, date_of_birth, name_in_langs, bio_data, comment_list, profession)
- #110 is institue
- elif record_key == '110':
- return entities.Institution()
- #151 is location
- elif record_key == '151':
- name_in_langs = dict()
- types_of_place = list()
- comment_list = list()
- eng_name = ''
- name_diff = ''
-
- for field in raw_object.getroot():
- key = field.attrib.get('tag')
- tag = entity_keys.get(key)
- if tag == 'name':
- name = field.text
- elif tag == 'name_langindic':
- # chack if this english name
- if field.text == 'lat':
- eng_name = name
- # else add it to name_in_langs
- else:
- if field.text in name_in_langs:
- name_in_langs.get(field.text).append(name)
- else:
- name_in_langs.update({field.text: [name]})
- elif tag == 'type_of_place':
- types_of_place.append(field.text)
- elif tag == 'name_in_langs':
- name_diff = field.text
- elif tag == 'langs_langindic':
- if field.text in name_in_langs:
- name_in_langs.get(field.text).append(name_diff)
- else:
- name_in_langs.update({field.text: [name_diff]})
- elif tag == 'comment':
- comment_list.append(field.text)
- return entities.Location(eng_name, types_of_place , name_in_langs, comment_list)
- else:
- return None
- # raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self)))
-
-
+import entities +from factories import BasicFactory +import xml.etree.cElementTree as ET + +TAG_TO_ENTITY_MAPPING = { + '100': entities.Person, + '110': entities.Institution, + '151': entities.Location +} + + +ENTITY_KEYS = { + '100.a': 'name', + '100.9': 'name_langindic', + '100.d': 'date_of_birth', + '400.a': 'name_in_langs', + '400.9': 'langs_langindic', + '678.a': 'bio_data', + '151.a': 'name', + '151.9': 'name_langindic', + '451:a': 'name_in_langs', + '451:9': 'langs_langindic', + '550.a': 'type_of_place', + '667.a': 'comment', + '374.a': 'profession' +} + + +def get_record_key(record): + root = record.getroot() + for field in root: + field_tag = field.attrib.get('tag') + if '100' in field_tag: + return '100' + if '151' in field_tag: + return '151' + if '110' in field_tag: + return '110' + +class INLFactory(BasicFactory): + def __init__(self, tag_to_entity_mapping=None): + self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING + + def get_entity(self, raw_object, entity_keys=ENTITY_KEYS): + record_key = get_record_key(raw_object) + #100 is person + if record_key == '100': + name = '' + name_in_langs = dict() + bio_data = list() + comment_list = list() + eng_name = '' + date_of_birth = '' + profession = list() + name_diff = '' + #get the names and date of birth and bio data + for field in raw_object.getroot(): + key = field.attrib.get('tag') + tag = entity_keys.get(key) + if tag == 'name': + name = field.text + elif tag == 'name_langindic': + # chack if this english name + if field.text == 'lat': + eng_name = name + # else add it to name_in_langs + else: + if field.text in name_in_langs: + name_in_langs.get(field.text).append(name) + else: + name_in_langs.update({field.text: [name]}) + elif tag == 'date_of_birth': + date_of_birth = field.text + elif tag == 'name_in_langs': + name_diff = field.text + elif tag == 'langs_langindic': + if field.text in name_in_langs: + name_in_langs.get(field.text).append(name_diff) + else: + name_in_langs.update({field.text: [name_diff]}) + elif tag == 'bio_data': + bio_data.append(field.text) + elif tag == 'comment': + comment_list.append(field.text) + elif tag == 'profession': + profession.append(field.text) + return entities.Person(eng_name, date_of_birth, name_in_langs, bio_data, comment_list, profession) + #110 is institue + elif record_key == '110': + return entities.Institution() + #151 is location + elif record_key == '151': + name_in_langs = dict() + types_of_place = list() + comment_list = list() + eng_name = '' + name_diff = '' + + for field in raw_object.getroot(): + key = field.attrib.get('tag') + tag = entity_keys.get(key) + if tag == 'name': + name = field.text + elif tag == 'name_langindic': + # chack if this english name + if field.text == 'lat': + eng_name = name + # else add it to name_in_langs + else: + if field.text in name_in_langs: + name_in_langs.get(field.text).append(name) + else: + name_in_langs.update({field.text: [name]}) + elif tag == 'type_of_place': + types_of_place.append(field.text) + elif tag == 'name_in_langs': + name_diff = field.text + elif tag == 'langs_langindic': + if field.text in name_in_langs: + name_in_langs.get(field.text).append(name_diff) + else: + name_in_langs.update({field.text: [name_diff]}) + elif tag == 'comment': + comment_list.append(field.text) + return entities.Location(eng_name, types_of_place , name_in_langs, comment_list) + else: + return None + # raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self))) + + diff --git a/factories/__init__.py b/factories/__init__.py index 947845c..86901f5 100644 --- a/factories/__init__.py +++ b/factories/__init__.py @@ -1,2 +1,2 @@ -from factories.basic_factory import BasicFactory
+from factories.basic_factory import BasicFactory from factories.INL_factory import INLFactory
\ No newline at end of file diff --git a/factories/basic_factory.py b/factories/basic_factory.py index 1974d65..1715846 100644 --- a/factories/basic_factory.py +++ b/factories/basic_factory.py @@ -1,3 +1,3 @@ -class BasicFactory(object):
- def get_entity(self, entity_key, raw_object):
- raise NotImplementedError("get_entity() method must be implemented class {}".format(type(self)))
+class BasicFactory(object): + def get_entity(self, entity_key, raw_object): + raise NotImplementedError("get_entity() method must be implemented class {}".format(type(self))) diff --git a/libs/json_tools.py b/libs/json_tools.py index 9ce19b0..5e78d23 100644 --- a/libs/json_tools.py +++ b/libs/json_tools.py @@ -1,9 +1,9 @@ -import json
-
-
-class JsonSerializable(object):
- def __repr__(self):
- return str(self.to_json())
-
- def to_json(self):
- return json.dumps(self.__dict__, ensure_ascii=False)
+import json + + +class JsonSerializable(object): + def __repr__(self): + return str(self.to_json()) + + def to_json(self): + return json.dumps(self.__dict__, ensure_ascii=False) diff --git a/parsers/INL_xml_parser.py b/parsers/INL_xml_parser.py index cdde5a8..1a06f6b 100644 --- a/parsers/INL_xml_parser.py +++ b/parsers/INL_xml_parser.py @@ -1,36 +1,36 @@ -try:
- import xml.etree.cElementTree as ET
-except ImportError:
- import xml.etree.ElementTree as ET
-
-KNOWN_FIELD_TAGS = ['100', '110', '151']
-
-TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374']
-
-class INLXmlParser:
- def __init__(self, reader, whitelist=TAG_WHITELIST):
- self.reader = reader
- #self.whitelist = whitelist or KNOWN_FIELD_TAGS
- self.whitelist = whitelist
-
- def clearxml(self):
-
- # # scan the datafields in the records and copy to the new one only the tags in the whitelist
- # for record in root: # create new record
- newRecord = ET.Element('record')
- for field in self.reader:
- fieldtag = field.attrib.get('tag')
- if fieldtag in self.whitelist:
- temptag = fieldtag
- # tag 700 and 400 are the same
- if temptag == '700':
- temptag = '400'
- for data in field:
- newFieldTag = temptag
- newFieldTag += '.'
- newFieldTag += data.attrib.get('code')
- newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag})
- newTag.text = data.text
-
- newRecordTree = ET.ElementTree(newRecord)
- return ET.ElementTree(newRecord)
+try: + import xml.etree.cElementTree as ET +except ImportError: + import xml.etree.ElementTree as ET + +KNOWN_FIELD_TAGS = ['100', '110', '151'] + +TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374'] + +class INLXmlParser: + def __init__(self, reader, whitelist=TAG_WHITELIST): + self.reader = reader + #self.whitelist = whitelist or KNOWN_FIELD_TAGS + self.whitelist = whitelist + + def clearxml(self): + + # # scan the datafields in the records and copy to the new one only the tags in the whitelist + # for record in root: # create new record + newRecord = ET.Element('record') + for field in self.reader: + fieldtag = field.attrib.get('tag') + if fieldtag in self.whitelist: + temptag = fieldtag + # tag 700 and 400 are the same + if temptag == '700': + temptag = '400' + for data in field: + newFieldTag = temptag + newFieldTag += '.' + newFieldTag += data.attrib.get('code') + newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag}) + newTag.text = data.text + + newRecordTree = ET.ElementTree(newRecord) + return ET.ElementTree(newRecord) diff --git a/parsers/__init__.py b/parsers/__init__.py index 07907f9..d32c917 100644 --- a/parsers/__init__.py +++ b/parsers/__init__.py @@ -1,2 +1,2 @@ -
+ from .INL_xml_parser import INLXmlParser
\ No newline at end of file diff --git a/parsers/basic_parser.py b/parsers/basic_parser.py index 32c1b43..dae19cb 100644 --- a/parsers/basic_parser.py +++ b/parsers/basic_parser.py @@ -1,6 +1,6 @@ -class BasicParser(object):
- def __init__(self):
- pass
-
- def parse(self, data):
- raise NotImplementedError("parse() method must be implemented class {}".format(type(self)))
+class BasicParser(object): + def __init__(self): + pass + + def parse(self, data): + raise NotImplementedError("parse() method must be implemented class {}".format(type(self))) diff --git a/readers/xml_reader.py b/readers/xml_reader.py index 710899d..5b2d1fd 100644 --- a/readers/xml_reader.py +++ b/readers/xml_reader.py @@ -1,61 +1,61 @@ -# from __future__ import absolute_import
-import json
-import csv
-import parsers, factories
-from entities import Person
-
-try:
- import xml.etree.cElementTree as ET
-except ImportError:
- import xml.etree.ElementTree as ET
-
-def read_file(path, element_key):
- # get an iterable
- record_counter = 0
- context = ET.iterparse(path, events=("start", "end"))
-
- # turn it into an iterator
- context = iter(context)
-
- # get the root element
- event, root = context.__next__()
-
- # the factory
- inl_factory = factories.INLFactory()
- files = {}
- for event, element in context:
- if 'end' in event:
- if element_key in element.tag:
- # enter the processing here
- record_counter += 1
-
- #cleaned element is a tree
- inl_parser = parsers.INLXmlParser(element)
- cleaned_element = inl_parser.clearxml()
- entity = inl_factory.get_entity(cleaned_element)
-
- # test print the entity
- if entity != None:
- if entity.TYPE not in files:
- files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8')
- json_entity = entity.to_json()
- print(json_entity)
- writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS)
- writer.writerow(entity.to_csv_dict())
- # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False)
- # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False)
-
- # entity.print_entity()
-
- # TODO analys and upload the entity
-
-
- # import pdb; pdb.set_trace()
- print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@',
- cleaned_element.getroot().text)
- element.clear()
- print(record_counter)
-
-
-if __name__ == '__main__':
- read_file(r"../../NLI-nnl10.xml", 'record')
+# from __future__ import absolute_import +import json +import csv +import parsers, factories +from entities import Person + +try: + import xml.etree.cElementTree as ET +except ImportError: + import xml.etree.ElementTree as ET + +def read_file(path, element_key): + # get an iterable + record_counter = 0 + context = ET.iterparse(path, events=("start", "end")) + + # turn it into an iterator + context = iter(context) + + # get the root element + event, root = context.__next__() + + # the factory + inl_factory = factories.INLFactory() + files = {} + for event, element in context: + if 'end' in event: + if element_key in element.tag: + # enter the processing here + record_counter += 1 + + #cleaned element is a tree + inl_parser = parsers.INLXmlParser(element) + cleaned_element = inl_parser.clearxml() + entity = inl_factory.get_entity(cleaned_element) + + # test print the entity + if entity != None: + if entity.TYPE not in files: + files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8') + json_entity = entity.to_json() + print(json_entity) + writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS) + writer.writerow(entity.to_csv_dict()) + # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False) + # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False) + + # entity.print_entity() + + # TODO analys and upload the entity + + + # import pdb; pdb.set_trace() + print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', + cleaned_element.getroot().text) + element.clear() + print(record_counter) + + +if __name__ == '__main__': + read_file(r"../../NLI-nnl10.xml", 'record') diff --git a/testers/factorytester.py b/testers/factorytester.py index 55ebe7c..88e660d 100644 --- a/testers/factorytester.py +++ b/testers/factorytester.py @@ -1,21 +1,21 @@ -from __future__ import absolute_import
-import parsers
-import factories
-import xml.etree.cElementTree as ET
-
-xmlpath = r"C:\roy\NLI-nnl10 - 1MB.xml"
-whitelist = ['100', '374', '400', '151', '451', '550', '551', '678']
-
-
-xmltree = ET.parse(xmlpath)
-entities = list()
-inl_factory = factories.INLFactory()
-
-for record in xmltree.getroot():
- inl_parser = parsers.INLXmlParser(record, whitelist)
- clean_record = inl_parser.clearxml()
- entities.append(inl_factory.get_entity(clean_record))
-
-for entity in entities:
- print(entity)
-
+from __future__ import absolute_import +import parsers +import factories +import xml.etree.cElementTree as ET + +xmlpath = r"C:\roy\NLI-nnl10 - 1MB.xml" +whitelist = ['100', '374', '400', '151', '451', '550', '551', '678'] + + +xmltree = ET.parse(xmlpath) +entities = list() +inl_factory = factories.INLFactory() + +for record in xmltree.getroot(): + inl_parser = parsers.INLXmlParser(record, whitelist) + clean_record = inl_parser.clearxml() + entities.append(inl_factory.get_entity(clean_record)) + +for entity in entities: + print(entity) + diff --git a/writers/wd_writer.py b/writers/wd_writer.py index 4a456e5..a9a3e42 100644 --- a/writers/wd_writer.py +++ b/writers/wd_writer.py @@ -1,7 +1,7 @@ -import pywikibot
-from pywikibot import pagegenerators, WikidataBot
-
-
-class WDWriter(object):
- def __init__(self, source_path, reader, factory):
- self.source_path = source_path
+import pywikibot +from pywikibot import pagegenerators, WikidataBot + + +class WDWriter(object): + def __init__(self, source_path, reader, factory): + self.source_path = source_path |