Use UNIX line endings

author: Tzafrir Cohen <tzafrir@cohens.org.il> 2016-09-25 20:28:40 +0300
committer: Tzafrir Cohen <tzafrir@cohens.org.il> 2016-09-25 20:28:40 +0300
commit: fa107061d3d8f3decf12120ba2f4ebed1e78218a (patch)
tree: b07d2a56f0b9c0d606336bf51f4e1126bee02206
parent: 6503c222d1c2145f32d9678157a9659b70a12c83 (diff)
15 files changed, 389 insertions, 389 deletions
diff --git a/entities/__init__.py b/entities/__init__.py
index 701846e..907ef4d 100644
--- a/entities/__init__.py
+++ b/entities/__init__.py
@@ -1,3 +1,3 @@
-from entities.person import Person
-from entities.institution import Institution
+from entities.person import Person
+from entities.institution import Institution
 from entities.location import Location
 \ No newline at end of file
diff --git a/entities/basic_entity.py b/entities/basic_entity.py
index 9e8f11b..9181422 100644
--- a/entities/basic_entity.py
+++ b/entities/basic_entity.py
@@ -1,5 +1,5 @@
-from libs import JsonSerializable
-
-
-class BasicEntity(JsonSerializable):
-    pass
+from libs import JsonSerializable
+
+
+class BasicEntity(JsonSerializable):
+    pass
diff --git a/entities/institution.py b/entities/institution.py
index 6be86fc..4538207 100644
--- a/entities/institution.py
+++ b/entities/institution.py
@@ -1,6 +1,6 @@
-from entities.basic_entity import BasicEntity
-
-
-class Institution(BasicEntity):
-    def __init__(self):
-        raise NotImplementedError()
+from entities.basic_entity import BasicEntity
+
+
+class Institution(BasicEntity):
+    def __init__(self):
+        raise NotImplementedError()
diff --git a/entities/location.py b/entities/location.py
index f782e1f..a43eb8d 100644
--- a/entities/location.py
+++ b/entities/location.py
@@ -1,25 +1,25 @@
-import json
-
-from entities.basic_entity import BasicEntity
-
-
-class Location(BasicEntity):
-    def __init__(self, name, types_of_place, name_in_langs, comments_list):
-        self.name = name
-        self.types_of_place = types_of_place
-        self.name_in_langs = name_in_langs
-        self.comments_list = comments_list
-
-    CSV_FIELDS = ["name", "comments"]
-    TYPE = "LOCATION"
-
-
-    def print_entity(self):
-        print("Name = " + self.name)
-        print("Name in langs = " + str(self.name_in_langs))
-        print("Types = " + str(self.types_of_place))
-        print("Comments = " + str(self.comments_list))
-
-    def to_csv_dict(self):
-        return {'name': self.name,
-                'comments': json.dumps(self.comments_list, ensure_ascii=False)}
+import json
+
+from entities.basic_entity import BasicEntity
+
+
+class Location(BasicEntity):
+    def __init__(self, name, types_of_place, name_in_langs, comments_list):
+        self.name = name
+        self.types_of_place = types_of_place
+        self.name_in_langs = name_in_langs
+        self.comments_list = comments_list
+
+    CSV_FIELDS = ["name", "comments"]
+    TYPE = "LOCATION"
+
+
+    def print_entity(self):
+        print("Name = " + self.name)
+        print("Name in langs = " + str(self.name_in_langs))
+        print("Types = " + str(self.types_of_place))
+        print("Comments = " + str(self.comments_list))
+
+    def to_csv_dict(self):
+        return {'name': self.name,
+                'comments': json.dumps(self.comments_list, ensure_ascii=False)}
diff --git a/entities/person.py b/entities/person.py
index a5aa396..b315aac 100644
--- a/entities/person.py
+++ b/entities/person.py
@@ -1,76 +1,76 @@
-import json
-
-from entities.basic_entity import BasicEntity
-
-
-class Person(BasicEntity):
-    def __init__(self, name, date_of_birth, name_in_langs, bio_data, comments_list, profession):
-        """
-
-        :param name:
-        :param date_of_birth:
-        :param name_in_langs: Mapping of the persons's name in various languages, as a dictionary. For example:
-            {
-                "latin": "George"
-                "heb": "[george in hebrew]"
-            }
-        """
-        self.name = name
-        years_parts = date_of_birth.split('-')
-        if (len(years_parts) == 2):
-            self.birth_year = years_parts[0]
-            self.death_year = years_parts[1]
-        else:
-            self.birth_year = date_of_birth.strip()
-            self.death_year = ''
-        self.name_in_langs = name_in_langs
-        '''
-        place_of_birth = list()
-        place_of_death = list()
-        profession = list()
-        for comment in bio_data:
-            encoded_comment = ''.join(comment).strip()
-            if encoded_comment.startswith(u"מקום לידה: "):
-                place_of_birth.append(encoded_comment.partition(u"מקום לידה: ")[2])
-            if encoded_comment.startswith(u"מקום פטירה: "):
-                place_of_death.append(encoded_comment.partition(u"מקום פטירה: ")[2])
-            if encoded_comment.startswith(u"מקצוע: "):
-                profession.append(encoded_comment.partition(u"מקום פטירה: ")[2])
-
-        self.place_of_birth = place_of_birth
-        self.place_of_death = place_of_death
-        self.profession = profession
-        '''
-        bio_data_dict = dict()
-        for elem in bio_data:
-            elem_splitted = elem.split(":")
-            if len(elem_splitted) == 2:
-                bio_data_key = elem_splitted[0]
-                bio_data_value = elem_splitted[1]
-                if bio_data_key in bio_data_dict:
-                    bio_data_dict.get(bio_data_key).append(bio_data_value)
-                else:
-                    bio_data_dict.update(
-                        {bio_data_key: [bio_data_value]}
-                    )
-            else:
-                bio_data_dict.update({elem: ''})
-        self.bio_data = bio_data_dict
-        self.comments_list = comments_list
-        self.profession = profession
-
-    CSV_FIELDS = ["name", "biodata", "comments"]
-    TYPE = 'PERSON'
-
-    def print_entity(self):
-        print("Name = " + self.name)
-        print("Birth year = " + self.birth_year)
-        print("Death year = " + self.death_year)
-        print("Names in langs = " + str(self.name_in_langs))
-        print("Bio Data = " + str(self.bio_data))
-        print("Comments = " + str(self.comments_list))
-        print("Profession = " + str(self.profession))
-
-    def to_csv_dict(self):
-        return {'name': self.name, 'biodata': self.bio_data,
-                'comments': json.dumps(self.comments_list, ensure_ascii=False)}
+import json
+
+from entities.basic_entity import BasicEntity
+
+
+class Person(BasicEntity):
+    def __init__(self, name, date_of_birth, name_in_langs, bio_data, comments_list, profession):
+        """
+
+        :param name:
+        :param date_of_birth:
+        :param name_in_langs: Mapping of the persons's name in various languages, as a dictionary. For example:
+            {
+                "latin": "George"
+                "heb": "[george in hebrew]"
+            }
+        """
+        self.name = name
+        years_parts = date_of_birth.split('-')
+        if (len(years_parts) == 2):
+            self.birth_year = years_parts[0]
+            self.death_year = years_parts[1]
+        else:
+            self.birth_year = date_of_birth.strip()
+            self.death_year = ''
+        self.name_in_langs = name_in_langs
+        '''
+        place_of_birth = list()
+        place_of_death = list()
+        profession = list()
+        for comment in bio_data:
+            encoded_comment = ''.join(comment).strip()
+            if encoded_comment.startswith(u"מקום לידה: "):
+                place_of_birth.append(encoded_comment.partition(u"מקום לידה: ")[2])
+            if encoded_comment.startswith(u"מקום פטירה: "):
+                place_of_death.append(encoded_comment.partition(u"מקום פטירה: ")[2])
+            if encoded_comment.startswith(u"מקצוע: "):
+                profession.append(encoded_comment.partition(u"מקום פטירה: ")[2])
+
+        self.place_of_birth = place_of_birth
+        self.place_of_death = place_of_death
+        self.profession = profession
+        '''
+        bio_data_dict = dict()
+        for elem in bio_data:
+            elem_splitted = elem.split(":")
+            if len(elem_splitted) == 2:
+                bio_data_key = elem_splitted[0]
+                bio_data_value = elem_splitted[1]
+                if bio_data_key in bio_data_dict:
+                    bio_data_dict.get(bio_data_key).append(bio_data_value)
+                else:
+                    bio_data_dict.update(
+                        {bio_data_key: [bio_data_value]}
+                    )
+            else:
+                bio_data_dict.update({elem: ''})
+        self.bio_data = bio_data_dict
+        self.comments_list = comments_list
+        self.profession = profession
+
+    CSV_FIELDS = ["name", "biodata", "comments"]
+    TYPE = 'PERSON'
+
+    def print_entity(self):
+        print("Name = " + self.name)
+        print("Birth year = " + self.birth_year)
+        print("Death year = " + self.death_year)
+        print("Names in langs = " + str(self.name_in_langs))
+        print("Bio Data = " + str(self.bio_data))
+        print("Comments = " + str(self.comments_list))
+        print("Profession = " + str(self.profession))
+
+    def to_csv_dict(self):
+        return {'name': self.name, 'biodata': self.bio_data,
+                'comments': json.dumps(self.comments_list, ensure_ascii=False)}
diff --git a/factories/INL_factory.py b/factories/INL_factory.py
index 286762a..f4e494f 100644
--- a/factories/INL_factory.py
+++ b/factories/INL_factory.py
@@ -1,130 +1,130 @@
-import entities
-from factories import BasicFactory
-import xml.etree.cElementTree as ET
-
-TAG_TO_ENTITY_MAPPING = {
-    '100': entities.Person,
-    '110': entities.Institution,
-    '151': entities.Location
-}
-
-
-ENTITY_KEYS = {
-    '100.a': 'name',
-    '100.9': 'name_langindic',
-    '100.d': 'date_of_birth',
-    '400.a': 'name_in_langs',
-    '400.9': 'langs_langindic',
-    '678.a': 'bio_data',
-    '151.a': 'name',
-    '151.9': 'name_langindic',
-    '451:a': 'name_in_langs',
-    '451:9': 'langs_langindic',
-    '550.a': 'type_of_place',
-    '667.a': 'comment',
-    '374.a': 'profession'
-}
-
-
-def get_record_key(record):
-    root = record.getroot()
-    for field in root:
-        field_tag = field.attrib.get('tag')
-        if '100' in field_tag:
-            return '100'
-        if '151' in field_tag:
-            return '151'
-        if '110' in field_tag:
-            return '110'
-
-class INLFactory(BasicFactory):
-    def __init__(self, tag_to_entity_mapping=None):
-        self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING
-
-    def get_entity(self,  raw_object, entity_keys=ENTITY_KEYS):
-        record_key = get_record_key(raw_object)
-        #100 is person
-        if record_key == '100':
-            name = ''
-            name_in_langs = dict()
-            bio_data = list()
-            comment_list = list()
-            eng_name = ''
-            date_of_birth = ''
-            profession = list()
-            name_diff = ''
-            #get the names and date of birth and bio data
-            for field in raw_object.getroot():
-                key = field.attrib.get('tag')
-                tag = entity_keys.get(key)
-                if tag == 'name':
-                    name = field.text
-                elif tag == 'name_langindic':
-                    # chack if this english name
-                    if field.text == 'lat':
-                        eng_name = name
-                    # else add it to name_in_langs
-                    else:
-                        if field.text in name_in_langs:
-                            name_in_langs.get(field.text).append(name)
-                        else:
-                            name_in_langs.update({field.text: [name]})
-                elif tag == 'date_of_birth':
-                    date_of_birth = field.text
-                elif tag == 'name_in_langs':
-                    name_diff = field.text
-                elif tag == 'langs_langindic':
-                    if field.text in name_in_langs:
-                        name_in_langs.get(field.text).append(name_diff)
-                    else:
-                        name_in_langs.update({field.text: [name_diff]})
-                elif tag == 'bio_data':
-                    bio_data.append(field.text)
-                elif tag == 'comment':
-                    comment_list.append(field.text)
-                elif tag == 'profession':
-                    profession.append(field.text)
-            return entities.Person(eng_name, date_of_birth, name_in_langs, bio_data, comment_list, profession)
-        #110 is institue
-        elif record_key == '110':
-            return entities.Institution()
-        #151 is location
-        elif record_key == '151':
-            name_in_langs = dict()
-            types_of_place = list()
-            comment_list = list()
-            eng_name = ''
-            name_diff = ''
-
-            for field in raw_object.getroot():
-                key = field.attrib.get('tag')
-                tag = entity_keys.get(key)
-                if tag == 'name':
-                    name = field.text
-                elif tag == 'name_langindic':
-                    # chack if this english name
-                    if field.text == 'lat':
-                        eng_name = name
-                    # else add it to name_in_langs
-                    else:
-                        if field.text in name_in_langs:
-                            name_in_langs.get(field.text).append(name)
-                        else:
-                            name_in_langs.update({field.text: [name]})
-                elif tag == 'type_of_place':
-                    types_of_place.append(field.text)
-                elif tag == 'name_in_langs':
-                    name_diff = field.text
-                elif tag == 'langs_langindic':
-                    if field.text in name_in_langs:
-                        name_in_langs.get(field.text).append(name_diff)
-                    else:
-                        name_in_langs.update({field.text: [name_diff]})
-                elif tag == 'comment':
-                    comment_list.append(field.text)
-            return entities.Location(eng_name, types_of_place , name_in_langs, comment_list)
-        else:
-            return None
-        #    raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self)))
-
-
+import entities
+from factories import BasicFactory
+import xml.etree.cElementTree as ET
+
+TAG_TO_ENTITY_MAPPING = {
+    '100': entities.Person,
+    '110': entities.Institution,
+    '151': entities.Location
+}
+
+
+ENTITY_KEYS = {
+    '100.a': 'name',
+    '100.9': 'name_langindic',
+    '100.d': 'date_of_birth',
+    '400.a': 'name_in_langs',
+    '400.9': 'langs_langindic',
+    '678.a': 'bio_data',
+    '151.a': 'name',
+    '151.9': 'name_langindic',
+    '451:a': 'name_in_langs',
+    '451:9': 'langs_langindic',
+    '550.a': 'type_of_place',
+    '667.a': 'comment',
+    '374.a': 'profession'
+}
+
+
+def get_record_key(record):
+    root = record.getroot()
+    for field in root:
+        field_tag = field.attrib.get('tag')
+        if '100' in field_tag:
+            return '100'
+        if '151' in field_tag:
+            return '151'
+        if '110' in field_tag:
+            return '110'
+
+class INLFactory(BasicFactory):
+    def __init__(self, tag_to_entity_mapping=None):
+        self.mapping = tag_to_entity_mapping or TAG_TO_ENTITY_MAPPING
+
+    def get_entity(self,  raw_object, entity_keys=ENTITY_KEYS):
+        record_key = get_record_key(raw_object)
+        #100 is person
+        if record_key == '100':
+            name = ''
+            name_in_langs = dict()
+            bio_data = list()
+            comment_list = list()
+            eng_name = ''
+            date_of_birth = ''
+            profession = list()
+            name_diff = ''
+            #get the names and date of birth and bio data
+            for field in raw_object.getroot():
+                key = field.attrib.get('tag')
+                tag = entity_keys.get(key)
+                if tag == 'name':
+                    name = field.text
+                elif tag == 'name_langindic':
+                    # chack if this english name
+                    if field.text == 'lat':
+                        eng_name = name
+                    # else add it to name_in_langs
+                    else:
+                        if field.text in name_in_langs:
+                            name_in_langs.get(field.text).append(name)
+                        else:
+                            name_in_langs.update({field.text: [name]})
+                elif tag == 'date_of_birth':
+                    date_of_birth = field.text
+                elif tag == 'name_in_langs':
+                    name_diff = field.text
+                elif tag == 'langs_langindic':
+                    if field.text in name_in_langs:
+                        name_in_langs.get(field.text).append(name_diff)
+                    else:
+                        name_in_langs.update({field.text: [name_diff]})
+                elif tag == 'bio_data':
+                    bio_data.append(field.text)
+                elif tag == 'comment':
+                    comment_list.append(field.text)
+                elif tag == 'profession':
+                    profession.append(field.text)
+            return entities.Person(eng_name, date_of_birth, name_in_langs, bio_data, comment_list, profession)
+        #110 is institue
+        elif record_key == '110':
+            return entities.Institution()
+        #151 is location
+        elif record_key == '151':
+            name_in_langs = dict()
+            types_of_place = list()
+            comment_list = list()
+            eng_name = ''
+            name_diff = ''
+
+            for field in raw_object.getroot():
+                key = field.attrib.get('tag')
+                tag = entity_keys.get(key)
+                if tag == 'name':
+                    name = field.text
+                elif tag == 'name_langindic':
+                    # chack if this english name
+                    if field.text == 'lat':
+                        eng_name = name
+                    # else add it to name_in_langs
+                    else:
+                        if field.text in name_in_langs:
+                            name_in_langs.get(field.text).append(name)
+                        else:
+                            name_in_langs.update({field.text: [name]})
+                elif tag == 'type_of_place':
+                    types_of_place.append(field.text)
+                elif tag == 'name_in_langs':
+                    name_diff = field.text
+                elif tag == 'langs_langindic':
+                    if field.text in name_in_langs:
+                        name_in_langs.get(field.text).append(name_diff)
+                    else:
+                        name_in_langs.update({field.text: [name_diff]})
+                elif tag == 'comment':
+                    comment_list.append(field.text)
+            return entities.Location(eng_name, types_of_place , name_in_langs, comment_list)
+        else:
+            return None
+        #    raise KeyError('Key {} was not recognized for factory {}'.format(entity_keys, type(self)))
+
+
diff --git a/factories/__init__.py b/factories/__init__.py
index 947845c..86901f5 100644
--- a/factories/__init__.py
+++ b/factories/__init__.py
@@ -1,2 +1,2 @@
-from factories.basic_factory import BasicFactory
+from factories.basic_factory import BasicFactory
 from factories.INL_factory import INLFactory
 \ No newline at end of file
diff --git a/factories/basic_factory.py b/factories/basic_factory.py
index 1974d65..1715846 100644
--- a/factories/basic_factory.py
+++ b/factories/basic_factory.py
@@ -1,3 +1,3 @@
-class BasicFactory(object):
-    def get_entity(self, entity_key, raw_object):
-        raise NotImplementedError("get_entity() method must be implemented class {}".format(type(self)))
+class BasicFactory(object):
+    def get_entity(self, entity_key, raw_object):
+        raise NotImplementedError("get_entity() method must be implemented class {}".format(type(self)))
diff --git a/libs/json_tools.py b/libs/json_tools.py
index 9ce19b0..5e78d23 100644
--- a/libs/json_tools.py
+++ b/libs/json_tools.py
@@ -1,9 +1,9 @@
-import json
-
-
-class JsonSerializable(object):
-    def __repr__(self):
-        return str(self.to_json())
-
-    def to_json(self):
-        return json.dumps(self.__dict__, ensure_ascii=False)
+import json
+
+
+class JsonSerializable(object):
+    def __repr__(self):
+        return str(self.to_json())
+
+    def to_json(self):
+        return json.dumps(self.__dict__, ensure_ascii=False)
diff --git a/parsers/INL_xml_parser.py b/parsers/INL_xml_parser.py
index cdde5a8..1a06f6b 100644
--- a/parsers/INL_xml_parser.py
+++ b/parsers/INL_xml_parser.py
@@ -1,36 +1,36 @@
-try:
-    import xml.etree.cElementTree as ET
-except ImportError:
-    import xml.etree.ElementTree as ET
-
-KNOWN_FIELD_TAGS = ['100', '110', '151']
-
-TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374']
-
-class INLXmlParser:
-    def __init__(self, reader, whitelist=TAG_WHITELIST):
-        self.reader = reader
-        #self.whitelist = whitelist or KNOWN_FIELD_TAGS
-        self.whitelist = whitelist
-
-    def clearxml(self):
-
-        # # scan the datafields in the records and copy to the new one only the tags in the whitelist
-        # for record in root:    # create new record
-        newRecord = ET.Element('record')
-        for field in self.reader:
-            fieldtag = field.attrib.get('tag')
-            if fieldtag in self.whitelist:
-                temptag = fieldtag
-                # tag 700 and 400 are the same
-                if temptag == '700':
-                    temptag = '400'
-                for data in field:
-                    newFieldTag = temptag
-                    newFieldTag += '.'
-                    newFieldTag += data.attrib.get('code')
-                    newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag})
-                    newTag.text = data.text
-
-        newRecordTree = ET.ElementTree(newRecord)
-        return ET.ElementTree(newRecord)
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+KNOWN_FIELD_TAGS = ['100', '110', '151']
+
+TAG_WHITELIST = ['100', '400', '700', '678', '667', '151', '550', '451', '374']
+
+class INLXmlParser:
+    def __init__(self, reader, whitelist=TAG_WHITELIST):
+        self.reader = reader
+        #self.whitelist = whitelist or KNOWN_FIELD_TAGS
+        self.whitelist = whitelist
+
+    def clearxml(self):
+
+        # # scan the datafields in the records and copy to the new one only the tags in the whitelist
+        # for record in root:    # create new record
+        newRecord = ET.Element('record')
+        for field in self.reader:
+            fieldtag = field.attrib.get('tag')
+            if fieldtag in self.whitelist:
+                temptag = fieldtag
+                # tag 700 and 400 are the same
+                if temptag == '700':
+                    temptag = '400'
+                for data in field:
+                    newFieldTag = temptag
+                    newFieldTag += '.'
+                    newFieldTag += data.attrib.get('code')
+                    newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag})
+                    newTag.text = data.text
+
+        newRecordTree = ET.ElementTree(newRecord)
+        return ET.ElementTree(newRecord)
diff --git a/parsers/__init__.py b/parsers/__init__.py
index 07907f9..d32c917 100644
--- a/parsers/__init__.py
+++ b/parsers/__init__.py
@@ -1,2 +1,2 @@
-
+
 from .INL_xml_parser import INLXmlParser
 \ No newline at end of file
diff --git a/parsers/basic_parser.py b/parsers/basic_parser.py
index 32c1b43..dae19cb 100644
--- a/parsers/basic_parser.py
+++ b/parsers/basic_parser.py
@@ -1,6 +1,6 @@
-class BasicParser(object):
-    def __init__(self):
-        pass
-
-    def parse(self, data):
-        raise NotImplementedError("parse() method must be implemented class {}".format(type(self)))
+class BasicParser(object):
+    def __init__(self):
+        pass
+
+    def parse(self, data):
+        raise NotImplementedError("parse() method must be implemented class {}".format(type(self)))
diff --git a/readers/xml_reader.py b/readers/xml_reader.py
index 710899d..5b2d1fd 100644
--- a/readers/xml_reader.py
+++ b/readers/xml_reader.py
@@ -1,61 +1,61 @@
-#  from __future__ import absolute_import
-import json
-import csv
-import parsers, factories
-from entities import Person
-
-try:
-    import xml.etree.cElementTree as ET
-except ImportError:
-    import xml.etree.ElementTree as ET
-
-def read_file(path, element_key):
-    # get an iterable
-    record_counter = 0
-    context = ET.iterparse(path, events=("start", "end"))
-
-    # turn it into an iterator
-    context = iter(context)
-
-    # get the root element
-    event, root = context.__next__()
-
-    # the factory
-    inl_factory = factories.INLFactory()
-    files = {}
-    for event, element in context:
-        if 'end' in event:
-            if element_key in element.tag:
-                # enter the processing here
-                record_counter += 1
-
-                #cleaned element is a tree
-                inl_parser = parsers.INLXmlParser(element)
-                cleaned_element = inl_parser.clearxml()
-                entity = inl_factory.get_entity(cleaned_element)
-
-                # test print the entity
-                if entity != None:
-                    if entity.TYPE not in files:
-                        files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8')
-                    json_entity = entity.to_json()
-                    print(json_entity)
-                    writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS)
-                    writer.writerow(entity.to_csv_dict())
-                    # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False)
-                    # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False)
-
-                    # entity.print_entity()
-
-                # TODO analys and upload the entity
-
-
-                # import pdb; pdb.set_trace()
-                print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@',
-                      cleaned_element.getroot().text)
-                element.clear()
-    print(record_counter)
-
-
-if __name__ == '__main__':
-    read_file(r"../../NLI-nnl10.xml", 'record')
+#  from __future__ import absolute_import
+import json
+import csv
+import parsers, factories
+from entities import Person
+
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+def read_file(path, element_key):
+    # get an iterable
+    record_counter = 0
+    context = ET.iterparse(path, events=("start", "end"))
+
+    # turn it into an iterator
+    context = iter(context)
+
+    # get the root element
+    event, root = context.__next__()
+
+    # the factory
+    inl_factory = factories.INLFactory()
+    files = {}
+    for event, element in context:
+        if 'end' in event:
+            if element_key in element.tag:
+                # enter the processing here
+                record_counter += 1
+
+                #cleaned element is a tree
+                inl_parser = parsers.INLXmlParser(element)
+                cleaned_element = inl_parser.clearxml()
+                entity = inl_factory.get_entity(cleaned_element)
+
+                # test print the entity
+                if entity != None:
+                    if entity.TYPE not in files:
+                        files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8')
+                    json_entity = entity.to_json()
+                    print(json_entity)
+                    writer = csv.DictWriter(files[entity.TYPE], entity.CSV_FIELDS)
+                    writer.writerow(entity.to_csv_dict())
+                    # json.dump(entity.comments_list, f667, indent=2, ensure_ascii=False)
+                    # json.dump(entity.bio_data, f678, indent=2, ensure_ascii=False)
+
+                    # entity.print_entity()
+
+                # TODO analys and upload the entity
+
+
+                # import pdb; pdb.set_trace()
+                print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@',
+                      cleaned_element.getroot().text)
+                element.clear()
+    print(record_counter)
+
+
+if __name__ == '__main__':
+    read_file(r"../../NLI-nnl10.xml", 'record')
diff --git a/testers/factorytester.py b/testers/factorytester.py
index 55ebe7c..88e660d 100644
--- a/testers/factorytester.py
+++ b/testers/factorytester.py
@@ -1,21 +1,21 @@
-from __future__ import absolute_import
-import parsers
-import factories
-import xml.etree.cElementTree as ET
-
-xmlpath = r"C:\roy\NLI-nnl10 - 1MB.xml"
-whitelist = ['100', '374', '400', '151', '451', '550', '551', '678']
-
-
-xmltree = ET.parse(xmlpath)
-entities = list()
-inl_factory = factories.INLFactory()
-
-for record in xmltree.getroot():
-    inl_parser = parsers.INLXmlParser(record, whitelist)
-    clean_record = inl_parser.clearxml()
-    entities.append(inl_factory.get_entity(clean_record))
-
-for entity in entities:
-    print(entity)
-
+from __future__ import absolute_import
+import parsers
+import factories
+import xml.etree.cElementTree as ET
+
+xmlpath = r"C:\roy\NLI-nnl10 - 1MB.xml"
+whitelist = ['100', '374', '400', '151', '451', '550', '551', '678']
+
+
+xmltree = ET.parse(xmlpath)
+entities = list()
+inl_factory = factories.INLFactory()
+
+for record in xmltree.getroot():
+    inl_parser = parsers.INLXmlParser(record, whitelist)
+    clean_record = inl_parser.clearxml()
+    entities.append(inl_factory.get_entity(clean_record))
+
+for entity in entities:
+    print(entity)
+
diff --git a/writers/wd_writer.py b/writers/wd_writer.py
index 4a456e5..a9a3e42 100644
--- a/writers/wd_writer.py
+++ b/writers/wd_writer.py
@@ -1,7 +1,7 @@
-import pywikibot
-from pywikibot import pagegenerators, WikidataBot
-
-
-class WDWriter(object):
-    def __init__(self, source_path, reader, factory):
-        self.source_path = source_path
+import pywikibot
+from pywikibot import pagegenerators, WikidataBot
+
+
+class WDWriter(object):
+    def __init__(self, source_path, reader, factory):
+        self.source_path = source_path
author	Tzafrir Cohen <tzafrir@cohens.org.il>	2016-09-25 20:28:40 +0300
committer	Tzafrir Cohen <tzafrir@cohens.org.il>	2016-09-25 20:28:40 +0300
commit	fa107061d3d8f3decf12120ba2f4ebed1e78218a (patch)
tree	b07d2a56f0b9c0d606336bf51f4e1126bee02206
parent	6503c222d1c2145f32d9678157a9659b70a12c83 (diff)