diff options
author | roy lewin <roy.lewin@gmail.com> | 2016-09-22 01:43:21 +0300 |
---|---|---|
committer | roy lewin <roy.lewin@gmail.com> | 2016-09-22 01:43:21 +0300 |
commit | 652781137f3856fef98e3063766f9f3b1a984a2e (patch) | |
tree | 88517562c6193cbc48f1d0a044bd6dd97d3a8549 | |
parent | 6beb87f2720b905c6b512a977ba2422fa183a832 (diff) |
Added xml_reader.py, and edited INL_xml_parser to work on a per-record basis
-rw-r--r-- | .idea/misc.xml | 3 | ||||
-rw-r--r-- | .idea/workspace.xml | 213 | ||||
-rw-r--r-- | __init__.py | 0 | ||||
-rw-r--r-- | parsers/INL_xml_parser.py | 47 | ||||
-rw-r--r-- | parsers/__init__.py | 1 | ||||
-rw-r--r-- | readers/__init__.py | 0 | ||||
-rw-r--r-- | readers/xml_reader.py | 31 |
7 files changed, 236 insertions, 59 deletions
diff --git a/.idea/misc.xml b/.idea/misc.xml index de9bbc8..f38dd77 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,7 @@ <?xml version="1.0" encoding="UTF-8"?> <project version="4"> <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7.10 (C:\Python27\python.exe)" project-jdk-type="Python SDK" /> + <component name="PythonCompatibilityInspectionAdvertiser"> + <option name="version" value="1" /> + </component> </project>
\ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 1a3da85..73e2873 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -1,7 +1,15 @@ <?xml version="1.0" encoding="UTF-8"?> <project version="4"> <component name="ChangeListManager"> - <list default="true" id="1d9b5e9b-4282-4345-b663-d1b92a287a32" name="Default" comment="" /> + <list default="true" id="1d9b5e9b-4282-4345-b663-d1b92a287a32" name="Default" comment=""> + <change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/__init__.py" /> + <change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/readers/__init__.py" /> + <change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/readers/xml_reader.py" /> + <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/misc.xml" afterPath="$PROJECT_DIR$/.idea/misc.xml" /> + <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" /> + <change type="MODIFICATION" beforePath="$PROJECT_DIR$/parsers/INL_xml_parser.py" afterPath="$PROJECT_DIR$/parsers/INL_xml_parser.py" /> + <change type="MODIFICATION" beforePath="$PROJECT_DIR$/parsers/__init__.py" afterPath="$PROJECT_DIR$/parsers/__init__.py" /> + </list> <ignored path="lib2wiki.iws" /> <ignored path=".idea/workspace.xml" /> <ignored path=".idea/dataSources.local.xml" /> @@ -12,6 +20,9 @@ <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> <option name="LAST_RESOLUTION" value="IGNORE" /> </component> + <component name="CoverageDataManager"> + <SUITE FILE_PATH="coverage/lib2wiki$Unnamed.coverage" NAME="xml_reader Coverage Results" MODIFIED="1474497479052" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/readers" /> + </component> <component name="CreatePatchCommitExecutor"> <option name="PATCH_PATH" value="" /> </component> @@ -31,11 +42,11 @@ </provider> </entry> </file> - <file leaf-file-name="INL_factory.py" pinned="false" current-in-tab="true"> + <file leaf-file-name="INL_factory.py" pinned="false" current-in-tab="false"> <entry file="file://$PROJECT_DIR$/factories/INL_factory.py"> <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="255"> - <caret line="15" column="31" selection-start-line="15" selection-start-column="31" selection-end-line="15" selection-end-column="31" /> + <state relative-caret-position="289"> + <caret line="17" column="33" selection-start-line="17" selection-start-column="33" selection-end-line="17" selection-end-column="33" /> <folding> <element signature="e#0#15#0" expanded="true" /> </folding> @@ -43,6 +54,48 @@ </provider> </entry> </file> + <file leaf-file-name="xml_reader.py" pinned="false" current-in-tab="true"> + <entry file="file://$PROJECT_DIR$/readers/xml_reader.py"> + <provider selected="true" editor-type-id="text-editor"> + <state relative-caret-position="282"> + <caret line="25" column="70" selection-start-line="25" selection-start-column="70" selection-end-line="25" selection-end-column="70" /> + <folding> + <element signature="e#0#38#0" expanded="true" /> + </folding> + </state> + </provider> + </entry> + </file> + <file leaf-file-name="__init__.py" pinned="false" current-in-tab="false"> + <entry file="file://$PROJECT_DIR$/parsers/__init__.py"> + <provider selected="true" editor-type-id="text-editor"> + <state relative-caret-position="0"> + <caret line="0" column="5" selection-start-line="0" selection-start-column="5" selection-end-line="0" selection-end-column="5" /> + <folding /> + </state> + </provider> + </entry> + </file> + <file leaf-file-name="person.py" pinned="false" current-in-tab="false"> + <entry file="file://$PROJECT_DIR$/entities/person.py"> + <provider selected="true" editor-type-id="text-editor"> + <state relative-caret-position="68"> + <caret line="4" column="57" selection-start-line="4" selection-start-column="44" selection-end-line="4" selection-end-column="57" /> + <folding /> + </state> + </provider> + </entry> + </file> + <file leaf-file-name="INL_xml_parser.py" pinned="false" current-in-tab="false"> + <entry file="file://$PROJECT_DIR$/parsers/INL_xml_parser.py"> + <provider selected="true" editor-type-id="text-editor"> + <state relative-caret-position="299"> + <caret line="28" column="66" selection-start-line="28" selection-start-column="33" selection-end-line="28" selection-end-column="66" /> + <folding /> + </state> + </provider> + </entry> + </file> <file leaf-file-name="json_tools.py" pinned="false" current-in-tab="false"> <entry file="file://$PROJECT_DIR$/libs/json_tools.py"> <provider selected="true" editor-type-id="text-editor"> @@ -80,6 +133,10 @@ <option value="$PROJECT_DIR$/entities/institution.py" /> <option value="$PROJECT_DIR$/factories/INL_factory.py" /> <option value="$PROJECT_DIR$/.gitignore" /> + <option value="$PROJECT_DIR$/parsers/__init__.py" /> + <option value="$PROJECT_DIR$/__init__.py" /> + <option value="$PROJECT_DIR$/parsers/INL_xml_parser.py" /> + <option value="$PROJECT_DIR$/readers/xml_reader.py" /> </list> </option> </component> @@ -137,6 +194,30 @@ <option name="myItemId" value="lib2wiki" /> <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" /> </PATH_ELEMENT> + </PATH> + <PATH> + <PATH_ELEMENT> + <option name="myItemId" value="lib2wiki" /> + <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" /> + </PATH_ELEMENT> + <PATH_ELEMENT> + <option name="myItemId" value="lib2wiki" /> + <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" /> + </PATH_ELEMENT> + <PATH_ELEMENT> + <option name="myItemId" value="readers" /> + <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" /> + </PATH_ELEMENT> + </PATH> + <PATH> + <PATH_ELEMENT> + <option name="myItemId" value="lib2wiki" /> + <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" /> + </PATH_ELEMENT> + <PATH_ELEMENT> + <option name="myItemId" value="lib2wiki" /> + <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" /> + </PATH_ELEMENT> <PATH_ELEMENT> <option name="myItemId" value="parsers" /> <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" /> @@ -191,10 +272,17 @@ </panes> </component> <component name="PropertiesComponent"> - <property name="last_opened_file_path" value="$PROJECT_DIR$" /> + <property name="last_opened_file_path" value="$PROJECT_DIR$/readers/xml_reader.py" /> <property name="WebServerToolWindowFactoryState" value="false" /> + <property name="settings.editor.selected.configurable" value="editing.templates" /> + <property name="settings.editor.splitter.proportion" value="0.2" /> + </component> + <component name="RecentsManager"> + <key name="CopyFile.RECENT_KEYS"> + <recent name="C:\roy\lib2wiki\readers" /> + </key> </component> - <component name="RunManager"> + <component name="RunManager" selected="Python.xml_reader"> <configuration default="true" type="DjangoTestsConfigurationType" factoryName="Django tests"> <option name="INTERPRETER_OPTIONS" value="" /> <option name="PARENT_ENVS" value="true" /> @@ -397,6 +485,27 @@ <option name="USE_KEYWORD" value="false" /> <method /> </configuration> + <configuration default="false" name="xml_reader" type="PythonConfigurationType" factoryName="Python"> + <option name="INTERPRETER_OPTIONS" value="" /> + <option name="PARENT_ENVS" value="true" /> + <envs> + <env name="PYTHONUNBUFFERED" value="1" /> + </envs> + <option name="SDK_HOME" value="C:\Python27\python.exe" /> + <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/readers" /> + <option name="IS_MODULE_SDK" value="false" /> + <option name="ADD_CONTENT_ROOTS" value="true" /> + <option name="ADD_SOURCE_ROOTS" value="true" /> + <module name="lib2wiki" /> + <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" /> + <option name="SCRIPT_NAME" value="$PROJECT_DIR$/readers/xml_reader.py" /> + <option name="PARAMETERS" value="" /> + <option name="SHOW_COMMAND_LINE" value="false" /> + <method /> + </configuration> + <list size="1"> + <item index="0" class="java.lang.String" itemvalue="Python.xml_reader" /> + </list> </component> <component name="ShelveChangesManager" show_recycled="false"> <option name="remove_strategy" value="false" /> @@ -413,26 +522,26 @@ </component> <component name="ToolWindowManager"> <frame x="-8" y="-8" width="1616" height="876" extended-state="6" /> - <editor active="false" /> + <editor active="true" /> <layout> - <window_info id="Project" active="true" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.21625" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" /> - <window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" /> - <window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" /> - <window_info id="Database" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" /> - <window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" /> - <window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" /> - <window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" /> - <window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" /> - <window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" /> - <window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" /> - <window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" /> - <window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" /> - <window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" /> - <window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" /> - <window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" /> - <window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" /> - <window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" /> - <window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" /> + <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.15625" sideWeight="0.5" order="1" side_tool="false" content_ui="combo" /> + <window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="10" side_tool="false" content_ui="tabs" /> + <window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="true" content_ui="tabs" /> + <window_info id="Database" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" /> + <window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" /> + <window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" /> + <window_info id="Run" active="true" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.32898173" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" /> + <window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" /> + <window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" /> + <window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="true" content_ui="tabs" /> + <window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" /> + <window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="8" side_tool="false" content_ui="tabs" /> + <window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="3" side_tool="false" content_ui="combo" /> + <window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" /> + <window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" /> + <window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" /> + <window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="9" side_tool="false" content_ui="tabs" /> + <window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" /> </layout> </component> <component name="Vcs.Log.UiProperties"> @@ -495,14 +604,6 @@ </state> </provider> </entry> - <entry file="file://$PROJECT_DIR$/entities/person.py"> - <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="136"> - <caret line="8" column="29" selection-start-line="8" selection-start-column="29" selection-end-line="8" selection-end-column="29" /> - <folding /> - </state> - </provider> - </entry> <entry file="file://$PROJECT_DIR$/entities/location.py"> <provider selected="true" editor-type-id="text-editor"> <state relative-caret-position="119"> @@ -545,13 +646,55 @@ </entry> <entry file="file://$PROJECT_DIR$/factories/INL_factory.py"> <provider selected="true" editor-type-id="text-editor"> - <state relative-caret-position="255"> - <caret line="15" column="31" selection-start-line="15" selection-start-column="31" selection-end-line="15" selection-end-column="31" /> + <state relative-caret-position="289"> + <caret line="17" column="33" selection-start-line="17" selection-start-column="33" selection-end-line="17" selection-end-column="33" /> <folding> <element signature="e#0#15#0" expanded="true" /> </folding> </state> </provider> </entry> + <entry file="file://$PROJECT_DIR$/entities/person.py"> + <provider selected="true" editor-type-id="text-editor"> + <state relative-caret-position="68"> + <caret line="4" column="57" selection-start-line="4" selection-start-column="44" selection-end-line="4" selection-end-column="57" /> + <folding /> + </state> + </provider> + </entry> + <entry file="file://$PROJECT_DIR$/parsers/__init__.py"> + <provider selected="true" editor-type-id="text-editor"> + <state relative-caret-position="0"> + <caret line="0" column="5" selection-start-line="0" selection-start-column="5" selection-end-line="0" selection-end-column="5" /> + <folding /> + </state> + </provider> + </entry> + <entry file="file://$PROJECT_DIR$/__init__.py"> + <provider selected="true" editor-type-id="text-editor"> + <state relative-caret-position="0"> + <caret line="0" column="0" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" /> + <folding /> + </state> + </provider> + </entry> + <entry file="file://$PROJECT_DIR$/parsers/INL_xml_parser.py"> + <provider selected="true" editor-type-id="text-editor"> + <state relative-caret-position="299"> + <caret line="28" column="66" selection-start-line="28" selection-start-column="33" selection-end-line="28" selection-end-column="66" /> + <folding /> + </state> + </provider> + </entry> + <entry file="file://$PROJECT_DIR$/readers/xml_reader.py"> + <provider selected="true" editor-type-id="text-editor"> + <state relative-caret-position="282"> + <caret line="25" column="70" selection-start-line="25" selection-start-column="70" selection-end-line="25" selection-end-column="70" /> + <folding> + <element signature="e#0#38#0" expanded="true" /> + </folding> + </state> + </provider> + </entry> </component> </project>
\ No newline at end of file diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/__init__.py diff --git a/parsers/INL_xml_parser.py b/parsers/INL_xml_parser.py index 2ea9a9b..512d46e 100644 --- a/parsers/INL_xml_parser.py +++ b/parsers/INL_xml_parser.py @@ -1,34 +1,33 @@ -import xml.etree.cElementTree as ET +try: + import xml.etree.cElementTree as ET +except ImportError: + import xml.etree.ElementTree as ET + +KNOWN_FIELD_TAGS = ['100', '110', '151'] class INLXmlParser: - def __init__(self, lst, path): - self.whitelist = lst - self.xmlpath = path + def __init__(self, reader, whitelist=None): + self.reader = reader + self.whitelist = whitelist or KNOWN_FIELD_TAGS def clearxml(self): - xmltree = ET.parse(self.xmlpath) - # root == list of records - root = xmltree.getroot() - - # create new data newTreeRoot = ET.Element('data') - # scan the datafields in the records and copy to the new one only the tags in the whitelist - for record in root: - # create new record - newRecord = ET.SubElement(newTreeRoot, 'record') - for field in record: - fieldtag = field.attrib.get('tag') - if fieldtag in self.whitelist: - newFieldTag = fieldtag - # tag 700 and 400 are the same - if newFieldTag == '700': - newFieldTag = '400' - newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag}) - for data in field: - subData = ET.SubElement(newTag, data.tag, data.attrib) - subData.text = data.text + # # scan the datafields in the records and copy to the new one only the tags in the whitelist + # for record in root: # create new record + newRecord = ET.SubElement(newTreeRoot, 'record') + for field in self.reader: + fieldtag = field.attrib.get('tag') + if fieldtag in self.whitelist: + newFieldTag = fieldtag + # tag 700 and 400 are the same + if newFieldTag == '700': + newFieldTag = '400' + newTag = ET.SubElement(newRecord, 'datafield', {'tag': newFieldTag}) + for data in field: + subData = ET.SubElement(newTag, data.tag, data.attrib) + subData.text = data.text newTree = ET.ElementTree(newTreeRoot) return newTree diff --git a/parsers/__init__.py b/parsers/__init__.py index e69de29..e3a246d 100644 --- a/parsers/__init__.py +++ b/parsers/__init__.py @@ -0,0 +1 @@ +from INL_xml_parser import INLXmlParser
\ No newline at end of file diff --git a/readers/__init__.py b/readers/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/readers/__init__.py diff --git a/readers/xml_reader.py b/readers/xml_reader.py new file mode 100644 index 0000000..8a819b0 --- /dev/null +++ b/readers/xml_reader.py @@ -0,0 +1,31 @@ +from __future__ import absolute_import +import parsers + +try: + import xml.etree.cElementTree as ET +except ImportError: + import xml.etree.ElementTree as ET + + +def read_file(path, element_key): + # get an iterable + record_counter = 0 + context = ET.iterparse(path, events=("start", "end")) + + # turn it into an iterator + context = iter(context) + + # get the root element + event, root = context.next() + + for event, element in context: + if 'end' in event: + if element_key in element.tag: + record_counter += 1 + cleaned_element = parsers.INLXmlParser(element).clearxml() + print record_counter, cleaned_element.getroot().attrib + element.clear() + + +if __name__ == '__main__': + read_file(r"../../NLI-nnl10.xml", 'record') |