From 41125eb195324d18d9c2c12aa12ecbf66dc5d495 Mon Sep 17 00:00:00 2001 From: roy lewin Date: Sun, 25 Sep 2016 19:15:46 +0300 Subject: WIP: merge changes not previously merged (Commit done by Tzafrir) --- .idea/misc.xml | 2 +- .idea/modules.xml | 2 +- .idea/parser.iml | 12 - .idea/workspace.xml | 652 +++++++---------------------------------------- readers/xml_reader.py | 17 +- testers/factorytester.py | 8 +- writers/__init__.py | 0 writers/wd_writer.py | 6 + 8 files changed, 109 insertions(+), 590 deletions(-) delete mode 100644 .idea/parser.iml create mode 100644 writers/__init__.py create mode 100644 writers/wd_writer.py diff --git a/.idea/misc.xml b/.idea/misc.xml index f778c9e..d3cc99c 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml index 405d108..9a7bd2d 100644 --- a/.idea/modules.xml +++ b/.idea/modules.xml @@ -2,7 +2,7 @@ - + \ No newline at end of file diff --git a/.idea/parser.iml b/.idea/parser.iml deleted file mode 100644 index 6f63a63..0000000 --- a/.idea/parser.iml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 7a9b802..bbac7b5 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -1,12 +1,16 @@ - - + + + + + + + - - + @@ -19,8 +23,7 @@ - - + - + - - - - - - - - - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - + - - + @@ -146,36 +58,14 @@ - - - @@ -188,8 +78,8 @@ @@ -220,27 +110,41 @@ - - - - - + + + + + + + + @@ -250,15 +154,15 @@ - - - @@ -269,46 +173,10 @@ - + - - - - - project - - - - - @@ -517,136 +379,62 @@ - - - + + + + - - - - - - - - 1474481137431 + + 1474553647621 - - 1474490333649 - - - 1474498941858 - - - 1474534309648 - - - 1474534819985 - - - 1474537696879 - - - 1474537703873 - - - 1474539772357 - - - 1474542828452 - - - 1474543685903 - - - 1474545222842 - - - 1474545328764 - - - 1474551297626 - - - + - + - - - - - - + + + + + - - + + + + - @@ -661,301 +449,33 @@ - - - - - - - - - + - - - - - - - - - - - - - - - - - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - + + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - + diff --git a/readers/xml_reader.py b/readers/xml_reader.py index af80e25..5b2d1fd 100644 --- a/readers/xml_reader.py +++ b/readers/xml_reader.py @@ -20,13 +20,13 @@ def read_file(path, element_key): # get the root element event, root = context.__next__() - #the factory + # the factory inl_factory = factories.INLFactory() files = {} for event, element in context: if 'end' in event: if element_key in element.tag: - #enter the processing here + # enter the processing here record_counter += 1 #cleaned element is a tree @@ -34,7 +34,7 @@ def read_file(path, element_key): cleaned_element = inl_parser.clearxml() entity = inl_factory.get_entity(cleaned_element) - #test print the entity + # test print the entity if entity != None: if entity.TYPE not in files: files[entity.TYPE] = open("../out/{}.csv".format(entity.TYPE), 'w+', encoding='utf8') @@ -47,12 +47,15 @@ def read_file(path, element_key): # entity.print_entity() - - #TODO analys and upload the entity + # TODO analys and upload the entity # import pdb; pdb.set_trace() - #print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', cleaned_element.getroot().text) + print(record_counter, cleaned_element.getroot().tag, '@@@', cleaned_element.getroot().attrib, '@@@', + cleaned_element.getroot().text) element.clear() + print(record_counter) + + if __name__ == '__main__': - read_file(r"C:/Users/Ilsar/Documents/datahack/NLI-nnl10.xml", 'record') + read_file(r"../../NLI-nnl10.xml", 'record') diff --git a/testers/factorytester.py b/testers/factorytester.py index b6029ca..88e660d 100644 --- a/testers/factorytester.py +++ b/testers/factorytester.py @@ -3,17 +3,19 @@ import parsers import factories import xml.etree.cElementTree as ET -xmlpath = 'C:/Users/Ilsar/Documents/datahack/xml_example.xml' +xmlpath = r"C:\roy\NLI-nnl10 - 1MB.xml" +whitelist = ['100', '374', '400', '151', '451', '550', '551', '678'] + xmltree = ET.parse(xmlpath) entities = list() inl_factory = factories.INLFactory() for record in xmltree.getroot(): - inl_parser = parsers.INLXmlParser(record) + inl_parser = parsers.INLXmlParser(record, whitelist) clean_record = inl_parser.clearxml() entities.append(inl_factory.get_entity(clean_record)) for entity in entities: - entity. print_entity() + print(entity) diff --git a/writers/__init__.py b/writers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/writers/wd_writer.py b/writers/wd_writer.py new file mode 100644 index 0000000..b88833f --- /dev/null +++ b/writers/wd_writer.py @@ -0,0 +1,6 @@ +import pywikibot +from pywikibot import pagegenerators, WikidataBot + +class WDWriter(object): + def __init__(self): + pass \ No newline at end of file -- cgit v1.2.3