{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "13e39ceb",
"metadata": {},
"outputs": [],
"source": [
"import boto3\n",
"import wave\n",
"\n",
"# boto3.setup_default_session(profile_name=\"...\")\n",
"polly = boto3.client(\"polly\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a670d769",
"metadata": {},
"outputs": [],
"source": [
"lines = [\n",
"    \"CATER is designed using infrastructure-as-code, and is deployed using a git-ops workflow. We're using ArgoCD, which is an open-source git-ops tool designed for Kubernetes. This means all infrastructure is fully reproducible, and all approved code updates are automatically deployed by ArgoCD.\",\n",
"    \"CATER consists of a set of applications (or components). By default we install everything required to run a T.R.E., but all components can be disabled or replaced by another implementation.\",\n",
"    \"For the first three months, work focussed on the backend task of writing and deploying components, but we've now started integrating those components so they can be used through a frontend.\",\n",
"\n",
"    \"You log in to CATER using Keycloak. A username and password are used in this demo, but multi-factor authentication can be easily added, and Keycloak can federate with other identity providers using SAML or O.I.D.C.\",\n",
"    \"In this demo we're using JupyterHub as a control plane for researcher workspaces.\",\n",
"    \"You can see a list of projects and workspace types. This demo only has Ubuntu Mate desktops.\",\n",
"    \"When you launch a workspace a new Kubernetes pod is created, project storage is mounted, and users are given access via Apache Guacamole, which is an open-source remote desktop gateway.\",\n",
"    \"As you can see, you have a full desktop via a web browser.\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf70124e",
"metadata": {},
"outputs": [],
"source": [
"voices = polly.describe_voices()[\"Voices\"]"
]
},
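{
"cell_type": "code",
"execution_count": null,
"id": "a1f0c3d2",
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch (not in the original flow): list the British English voices\n",
"# that support the neural engine, to sanity-check the VoiceId chosen below.\n",
"# The field names (Id, LanguageCode, SupportedEngines) come from the Polly\n",
"# DescribeVoices API response.\n",
"[v[\"Id\"] for v in voices if v[\"LanguageCode\"] == \"en-GB\" and \"neural\" in v[\"SupportedEngines\"]]"
]
},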
{
"cell_type": "code",
"execution_count": null,
"id": "47c5c74e",
"metadata": {},
"outputs": [],
"source": [
"# voice = \"Brian\"\n",
"voice = \"Amy\"\n",
"# voice = \"Emma\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d166646",
"metadata": {},
"outputs": [],
"source": [
"def speak(text, outfile):\n",
"    \"\"\"Synthesize `text` with Polly and save it as a WAV file.\"\"\"\n",
"    r = polly.synthesize_speech(\n",
"        Engine=\"neural\",\n",
"        LanguageCode=\"en-GB\",\n",
"        OutputFormat=\"pcm\",  # raw 16-bit signed little-endian samples\n",
"        Text=text,\n",
"        VoiceId=voice,\n",
"    )\n",
"    s = r[\"AudioStream\"]\n",
"\n",
"    # Polly's pcm output has no container, so wrap it in a WAV header:\n",
"    # mono, 2 bytes per sample, 16 kHz (Polly's default pcm sample rate).\n",
"    with wave.open(outfile, \"wb\") as wav:\n",
"        wav.setparams((1, 2, 16000, 0, \"NONE\", \"NONE\"))\n",
"        wav.writeframes(s.read())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f62929a0",
"metadata": {},
"outputs": [],
"source": [
"for i, text in enumerate(lines):\n",
"    print(text)\n",
"    speak(text, f\"{i:02d}.wav\")"
]
},
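{
"cell_type": "code",
"execution_count": null,
"id": "b2e4d6f8",
"metadata": {},
"outputs": [],
"source": [
"# Optional check: play one of the generated clips inline to verify the\n",
"# synthesis before concatenating. IPython.display.Audio ships with Jupyter.\n",
"from IPython.display import Audio\n",
"\n",
"Audio(\"00.wav\")"
]
},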
{
"cell_type": "code",
"execution_count": null,
"id": "5784e676-37aa-4e92-99bb-c34727329b32",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "07110973",
"metadata": {},
"outputs": [],
"source": [
"# ffmpeg-python bindings (pip install ffmpeg-python); also requires the\n",
"# ffmpeg and ffprobe binaries to be on the PATH.\n",
"import ffmpeg\n",
"import os\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9afc0e1e-34d8-4928-bba8-4ceb89f5f57d",
"metadata": {},
"outputs": [],
"source": [
"# Define input file paths\n",
"audio_files = [\"00.wav\", \"01.wav\", \"02.wav\", \"03.wav\", \"04.wav\", \"05.wav\", \"06.wav\", \"07.wav\"]\n",
"output_file = \"concatenated_with_gaps.wav\"\n",
"# audio_codec = \"libmp3lame\"  # Or \"aac\", \"pcm_s16le\", etc. based on desired output\n",
"audio_codec = \"pcm_s16le\"\n",
"\n",
"# Define the gap durations in seconds\n",
"START_DURATION = 2.0\n",
"GAP_DURATION = 2.0\n",
"END_DURATION = 2.0"
]
},
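{
"cell_type": "code",
"execution_count": null,
"id": "c3d5e7f9",
"metadata": {},
"outputs": [],
"source": [
"# Sanity-check sketch (assumes the files were generated by the Polly loop\n",
"# above): fail early if any expected input is missing, rather than deep\n",
"# inside ffmpeg with a less obvious error.\n",
"missing = [f for f in audio_files if not os.path.exists(f)]\n",
"if missing:\n",
"    raise FileNotFoundError(f\"Missing input files: {missing}\")"
]
},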
{
"cell_type": "code",
"execution_count": null,
"id": "752cf3cc-645a-4777-a535-89b2b6d761c4",
"metadata": {},
"outputs": [],
"source": [
"# --- Step 1: Get audio file properties for silence generation ---\n",
"# The generated silence must match the sample rate and channel layout of\n",
"# the real audio files, so probe each file with ffprobe.\n",
"def get_audio_properties(file_path):\n",
"    \"\"\"\n",
"    Gets sample rate, channel layout and duration of an audio file using ffprobe.\n",
"    \"\"\"\n",
"    try:\n",
"        probe = ffmpeg.probe(file_path)\n",
"        audio_stream = next((s for s in probe[\"streams\"] if s[\"codec_type\"] == \"audio\"), None)\n",
"        if audio_stream:\n",
"            return {\n",
"                \"sample_rate\": int(audio_stream[\"sample_rate\"]),\n",
"                \"channel_layout\": audio_stream.get(\"channel_layout\", \"mono\"),  # Default to mono if not found\n",
"                \"duration\": float(audio_stream[\"duration\"]),\n",
"            }\n",
"        else:\n",
"            raise ValueError(f\"No audio stream found for: {file_path}\")\n",
"    except ffmpeg.Error as e:\n",
"        print(f\"Error probing {file_path}: {e.stderr.decode()}\")\n",
"        raise\n",
"    except Exception as e:\n",
"        print(f\"An unexpected error occurred while probing {file_path}: {e}\")\n",
"        raise"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0149e5cd-a562-458a-a2d7-4b9ff4a24144",
"metadata": {},
"outputs": [],
"source": [
"# Properties for all audio files must match, otherwise the generated\n",
"# silence (and the concat filter) would be inconsistent\n",
"audio_props = [get_audio_properties(f) for f in audio_files]\n",
"sample_rates = set(p[\"sample_rate\"] for p in audio_props)\n",
"if len(sample_rates) != 1:\n",
"    raise ValueError(f\"Multiple sample rates found: {sample_rates}\")\n",
"channel_layouts = set(p[\"channel_layout\"] for p in audio_props)\n",
"if len(channel_layouts) != 1:\n",
"    raise ValueError(f\"Multiple channel layouts found: {channel_layouts}\")\n",
"\n",
"SAMPLE_RATE = next(iter(sample_rates))\n",
"CHANNEL_LAYOUT = next(iter(channel_layouts))\n",
"\n",
"print(f\"Using sample rate: {SAMPLE_RATE} Hz, channel layout: {CHANNEL_LAYOUT} for silence.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ff6db6c4-e646-4e3b-8bcc-adf1699431f1",
"metadata": {},
"outputs": [],
"source": [
"# --- Step 2: Build the filtergraph including silence ---\n",
"def make_silence(duration):\n",
"    \"\"\"Generate a silent segment matching the input files' format.\"\"\"\n",
"    return ffmpeg.input(\n",
"        f\"anullsrc=r={SAMPLE_RATE}:cl={CHANNEL_LAYOUT}\",\n",
"        f=\"lavfi\",  # \"lavfi\" is for libavfilter inputs like anullsrc\n",
"        t=duration,\n",
"    ).audio\n",
"\n",
"graph_inputs = []\n",
"\n",
"# Insert gap at start\n",
"graph_inputs.append(make_silence(START_DURATION))\n",
"\n",
"for i, audio_file in enumerate(audio_files):\n",
"    graph_inputs.append(ffmpeg.input(audio_file).audio)\n",
"\n",
"    # Add silence between tracks\n",
"    if i < len(audio_files) - 1:\n",
"        graph_inputs.append(make_silence(GAP_DURATION))\n",
"\n",
"# Insert gap at end\n",
"graph_inputs.append(make_silence(END_DURATION))\n",
"\n",
"# Apply the concat filter; it must be told how many inputs it has in total\n",
"concatenated_audio = ffmpeg.filter(\n",
"    graph_inputs,\n",
"    \"concat\",\n",
"    n=len(graph_inputs),\n",
"    v=0,  # No video streams\n",
"    a=1,  # One audio stream per segment\n",
")"
]
},
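{
"cell_type": "code",
"execution_count": null,
"id": "d4e6f8a0",
"metadata": {},
"outputs": [],
"source": [
"# Optional inspection sketch: print the ffmpeg command line that\n",
"# ffmpeg-python would build from this graph. \"preview.wav\" is a throwaway\n",
"# name used only for compiling the command; the real output is defined in\n",
"# the next cell.\n",
"print(\" \".join(ffmpeg.compile(ffmpeg.output(concatenated_audio, \"preview.wav\"))))"
]
},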
{
"cell_type": "code",
"execution_count": null,
"id": "360882f9-5fba-476f-a741-539e3df749cb",
"metadata": {},
"outputs": [],
"source": [
"# --- Step 3: Define the output and run ---\n",
"output_stream = ffmpeg.output(\n",
"    concatenated_audio,\n",
"    output_file,\n",
"    acodec=audio_codec,\n",
")\n",
"\n",
"try:\n",
"    ffmpeg.run(output_stream, overwrite_output=True)\n",
"    print(f\"Audio files concatenated to {output_file} with {GAP_DURATION}-second gaps successfully!\")\n",
"\n",
"except ffmpeg.Error as e:\n",
"    print(e)\n",
"    raise"
]
},
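{
"cell_type": "code",
"execution_count": null,
"id": "e5f7a9b1",
"metadata": {},
"outputs": [],
"source": [
"# Rough verification sketch: the output duration should be close to the sum\n",
"# of the input durations plus all inserted gaps (exactness depends on how\n",
"# the encoder pads frames, so expect a small difference).\n",
"expected = (\n",
"    START_DURATION\n",
"    + sum(p[\"duration\"] for p in audio_props)\n",
"    + GAP_DURATION * (len(audio_files) - 1)\n",
"    + END_DURATION\n",
")\n",
"actual = get_audio_properties(output_file)[\"duration\"]\n",
"print(f\"Expected ~{expected:.2f}s, got {actual:.2f}s\")"
]
},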
{
"cell_type": "code",
"execution_count": null,
"id": "2e8f960a-afa5-49c0-b423-c9eabb6d1b51",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
} |