# Extracted from a cgit blob view.
# path: root/readers/xml_reader.py
# blob: 3e630cbcf040ac5712097fbba3edbd04907e8257 (plain)
#  from __future__ import absolute_import
import parsers, factories

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET


def read_file(path, element_key):
    """Stream an XML file and print an entity for every matching element.

    The file is parsed incrementally with ``ET.iterparse``. Every element
    whose tag contains ``element_key`` is, once it has been parsed completely,
    passed through ``parsers.INLXmlParser(...).clearxml()`` and then through
    ``INLFactory.get_entity``. The resulting entity is printed, followed by a
    one-line summary (running count, tag, attributes and text of the cleaned
    tree's root).

    Args:
        path: Location of the XML file; forwarded unchanged to
            ``ET.iterparse``.
        element_key: Substring looked for in each element's tag. Because this
            is a substring test rather than an equality test, any tag that
            contains the key matches, including namespace-qualified tags of
            the form ``{uri}record``.

    Returns:
        None. All results are written to standard output.
    """
    record_counter = 0

    # "start" events are requested so that the very first event hands over
    # the root element before any of its children have been parsed.
    context = iter(ET.iterparse(path, events=("start", "end")))
    _, root = next(context)

    # The factory that turns a cleaned tree into an entity.
    inl_factory = factories.INLFactory()

    for event, element in context:
        # Only a finished ("end") element carries all of its children and
        # text, so everything else is skipped untouched.
        if event != 'end' or element_key not in element.tag:
            continue

        record_counter += 1

        # clearxml() yields a tree; getroot() is called on it below.
        cleaned_element = parsers.INLXmlParser(element).clearxml()
        entity = inl_factory.get_entity(cleaned_element)

        # Test print of the entity.
        entity.print_entity()

        # TODO: analyze and upload the entity

        cleaned_root = cleaned_element.getroot()
        print(record_counter, cleaned_root.tag, '@@@', cleaned_root.attrib, '@@@', cleaned_root.text)

        # Release memory only once the matching element has been processed.
        # Clearing every element at its own "end" event would empty the
        # children of a record before the record itself is handled.
        element.clear()
        # Drop the root's references to the children parsed so far, so that
        # processed records do not pile up while the rest of the file streams.
        root.clear()


if __name__ == '__main__':
    # Script entry point: run the reader over the hard-coded XML file and
    # treat every element whose tag contains 'record' as one record.
    read_file(path=r"../../NLI-nnl10.xml", element_key='record')