Skip to content

Instantly share code, notes, and snippets.

@vaclavdekanovsky
Last active November 1, 2020 21:50
Show Gist options
  • Save vaclavdekanovsky/b99c0196250e3d69851a12224d3e51af to your computer and use it in GitHub Desktop.
Save vaclavdekanovsky/b99c0196250e3d69851a12224d3e51af to your computer and use it in GitHub Desktop.
Unzip, update and zip again a set of XML files in a folder
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Unzip, update and zip XML in a folder using python"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-01T21:43:44.853838Z",
"start_time": "2020-11-01T21:43:44.836856Z"
}
},
"outputs": [],
"source": [
"import os\n",
"from pathlib import Path\n",
"import zipfile\n",
"from xml.etree import ElementTree as ET\n",
"from datetime import datetime"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-01T21:43:44.869804Z",
"start_time": "2020-11-01T21:43:44.854805Z"
}
},
"outputs": [],
"source": [
"source_folder = \"to_process\"\n",
"temp_folder = \"temp\"\n",
"output_folder = \"processed\"\n",
"\n",
"# values to which we update\n",
"new_prefix = \"updated\"\n",
"new_version = 3"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-01T21:43:44.885755Z",
"start_time": "2020-11-01T21:43:44.872769Z"
}
},
"outputs": [],
"source": [
"# create the output folder if it doesn't exists\n",
"Path(output_folder).mkdir(parents=True, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-01T21:43:44.901680Z",
"start_time": "2020-11-01T21:43:44.887716Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"['Test_001_20201027.zip', 'xyz_001_20201029.zip']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"archives_to_process = [f for f in os.listdir(source_folder) if f.endswith(\".zip\")]\n",
"archives_to_process"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-01T21:43:44.933598Z",
"start_time": "2020-11-01T21:43:44.904677Z"
}
},
"outputs": [],
"source": [
"for archive in archives_to_process:\n",
" \n",
" # unzip to temp folder\n",
" with zipfile.ZipFile(os.path.join(source_folder,archive), 'r') as zip_ref:\n",
" zip_ref.extractall(temp_folder)\n",
" \n",
" # iterate over the extracted files\n",
" for extracted_file in zip_ref.namelist():\n",
" extracted_path = os.path.join(temp_folder,extracted_file)\n",
" tree = ET.parse(extracted_path)\n",
" root = tree.getroot()\n",
" \n",
" # update the if node if exists\n",
" id = root.find(\"id\")\n",
" if id is not None:\n",
" split_id = id.text.split(\"_\")\n",
" split_id[0] = new_prefix\n",
" split_id[1] = \"{:03d}\".format(new_version)\n",
" new_id = \"_\".join(split_id)\n",
" root.find(\"id\").text = new_id\n",
" \n",
" # cover the option that id was not found, because we use the id in naming the output zip\n",
" else: \n",
" new_id = \"_\".join([prefix,\"{:03d}\".format(new_version),datetime.today().strftime('%Y%m%d')])\n",
"\n",
" \n",
" tree.write(extracted_path)\n",
" \n",
" # zip again into the new_id.zip\n",
" output_path = os.path.join(output_folder, new_id+\".zip\")\n",
" # open archive for writing\n",
" with zipfile.ZipFile(output_path, 'w') as myzip:\n",
" # write our processed xml to it, under it's file name only (not the full path)\n",
" myzip.write(extracted_path, os.path.basename(extracted_path))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment