"""INL XML parser: keep only whitelisted datafields of an XML record list.

Recovered from the patch "created the INL xml parser" (commit 587c722c),
which added this module as ``parsers/INL_xml_parser.py``.
"""
try:
    # C-accelerated alias; it was removed in Python 3.9.
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET


class INLXmlParser:
    """Filter the datafields of an XML record file by a tag whitelist.

    Attributes:
        whitelist: container of ``tag`` attribute values to keep; membership
            is tested with ``in``.
        xmlpath: path (or file object) of the source XML, passed to
            ``ET.parse``.
    """

    # Location the original implementation always wrote to. It stays the
    # default so existing ``clearxml()`` callers behave exactly as before.
    DEFAULT_OUTPUT_PATH = 'C:/Users/Ilsar/Documents/datahack/outTest.xml'

    # Tag 700 and 400 are the same, so 700 is emitted as 400.
    TAG_ALIASES = {'700': '400'}

    def __init__(self, lst, path):
        """Store the tag whitelist (``lst``) and the source XML ``path``."""
        self.whitelist = lst
        self.xmlpath = path

    def clearxml(self, outpath=DEFAULT_OUTPUT_PATH):
        """Build a reduced copy of the source XML and optionally write it.

        The root of the source document is treated as a list of records.
        For every record a ``<record>`` element is created under a new
        ``<data>`` root (even when none of its fields survive). Each child of
        the record whose ``tag`` attribute is in the whitelist is copied as a
        ``<datafield>`` carrying only that (possibly aliased) ``tag``
        attribute; its direct children are copied with their tag, attributes
        and text.

        Args:
            outpath: where to write the resulting tree. Defaults to
                ``DEFAULT_OUTPUT_PATH``; pass ``None`` to skip writing and
                only get the tree back.

        Returns:
            The new ``ElementTree`` rooted at ``<data>``.

        Raises:
            Whatever ``ET.parse`` raises for a missing or malformed source
            file, and whatever ``ElementTree.write`` raises when ``outpath``
            cannot be written.
        """
        # root == list of records
        source_root = ET.parse(self.xmlpath).getroot()
        new_root = ET.Element('data')

        for record in source_root:
            new_record = ET.SubElement(new_root, 'record')
            for field in record:
                tag = field.attrib.get('tag')
                if tag not in self.whitelist:
                    continue
                new_field = ET.SubElement(
                    new_record, 'datafield',
                    {'tag': self.TAG_ALIASES.get(tag, tag)})
                for data in field:
                    # Copy the attribute mapping so the new element never
                    # shares state with the source tree.
                    copied = ET.SubElement(
                        new_field, data.tag, dict(data.attrib))
                    copied.text = data.text

        new_tree = ET.ElementTree(new_root)
        if outpath is not None:
            new_tree.write(outpath)
        return new_tree