Last active
November 1, 2020 21:50
-
-
Save vaclavdekanovsky/b99c0196250e3d69851a12224d3e51af to your computer and use it in GitHub Desktop.
Unzip, update and zip again a set of XML files in a folder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Unzip, update and zip XML in a folder using python" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-11-01T21:43:44.853838Z", | |
"start_time": "2020-11-01T21:43:44.836856Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"from pathlib import Path\n", | |
"import zipfile\n", | |
"from xml.etree import ElementTree as ET\n", | |
"from datetime import datetime" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-11-01T21:43:44.869804Z", | |
"start_time": "2020-11-01T21:43:44.854805Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"source_folder = \"to_process\"\n", | |
"temp_folder = \"temp\"\n", | |
"output_folder = \"processed\"\n", | |
"\n", | |
"# values to which we update\n", | |
"new_prefix = \"updated\"\n", | |
"new_version = 3" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-11-01T21:43:44.885755Z", | |
"start_time": "2020-11-01T21:43:44.872769Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# create the output folder if it doesn't exist\n", | |
"Path(output_folder).mkdir(parents=True, exist_ok=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-11-01T21:43:44.901680Z", | |
"start_time": "2020-11-01T21:43:44.887716Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['Test_001_20201027.zip', 'xyz_001_20201029.zip']" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"archives_to_process = [f for f in os.listdir(source_folder) if f.endswith(\".zip\")]\n", | |
"archives_to_process" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-11-01T21:43:44.933598Z", | |
"start_time": "2020-11-01T21:43:44.904677Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"for archive in archives_to_process:\n", | |
" \n", | |
" # unzip to temp folder\n", | |
" with zipfile.ZipFile(os.path.join(source_folder,archive), 'r') as zip_ref:\n", | |
" zip_ref.extractall(temp_folder)\n", | |
" \n", | |
" # iterate over the extracted files\n", | |
" for extracted_file in zip_ref.namelist():\n", | |
" extracted_path = os.path.join(temp_folder,extracted_file)\n", | |
" tree = ET.parse(extracted_path)\n", | |
" root = tree.getroot()\n", | |
" \n", | |
" # update the if node if exists\n", | |
" id = root.find(\"id\")\n", | |
" if id is not None:\n", | |
" split_id = id.text.split(\"_\")\n", | |
" split_id[0] = new_prefix\n", | |
" split_id[1] = \"{:03d}\".format(new_version)\n", | |
" new_id = \"_\".join(split_id)\n", | |
" root.find(\"id\").text = new_id\n", | |
" \n", | |
" # cover the option that id was not found, because we use the id in naming the output zip\n", | |
" else: \n", | |
" new_id = \"_\".join([prefix,\"{:03d}\".format(new_version),datetime.today().strftime('%Y%m%d')])\n", | |
"\n", | |
" \n", | |
" tree.write(extracted_path)\n", | |
" \n", | |
" # zip again into the new_id.zip\n", | |
" output_path = os.path.join(output_folder, new_id+\".zip\")\n", | |
" # open archive for writing\n", | |
" with zipfile.ZipFile(output_path, 'w') as myzip:\n", | |
" # write our processed xml to it, under it's file name only (not the full path)\n", | |
" myzip.write(extracted_path, os.path.basename(extracted_path))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.4" | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
}, | |
"varInspector": { | |
"cols": { | |
"lenName": 16, | |
"lenType": 16, | |
"lenVar": 40 | |
}, | |
"kernels_config": { | |
"python": { | |
"delete_cmd_postfix": "", | |
"delete_cmd_prefix": "del ", | |
"library": "var_list.py", | |
"varRefreshCmd": "print(var_dic_list())" | |
}, | |
"r": { | |
"delete_cmd_postfix": ") ", | |
"delete_cmd_prefix": "rm(", | |
"library": "var_list.r", | |
"varRefreshCmd": "cat(var_dic_list()) " | |
} | |
}, | |
"types_to_exclude": [ | |
"module", | |
"function", | |
"builtin_function_or_method", | |
"instance", | |
"_Feature" | |
], | |
"window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment