x-pyannote.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "name": "pyannote.ipynb",
      "gpuType": "A100",
      "machine_shape": "hm",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/StanAngeloff/91480fac18a74d8aff3e4cf566cfd0ff/pyannote.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import locale\n",
        "\n",
        "def getpreferredencoding(do_setlocale=True):\n",
        "    return \"UTF-8\"\n",
        "\n",
        "locale.getpreferredencoding = getpreferredencoding"
      ],
      "metadata": {
        "id": "rudtRKL9P7KK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "lbtdzoCjO7DU"
      },
      "outputs": [],
      "source": [
        "!pip install \\\n",
        "  git+https://github.com/pyannote/pyannote-audio.git@7379f1c82be093078354449100e1a84cbdfbafdf \\\n",
        "  git+https://github.com/openai/whisper.git@248b6cb124225dd263bb9bd32d060b6517e067f8 \\\n",
        "  torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 \\\n",
        "  --extra-index-url https://download.pytorch.org/whl/cu118"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "\n",
        "torch.cuda.is_available()"
      ],
      "metadata": {
        "id": "J_Ss89GhRlvq"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import drive\n",
        "\n",
        "drive.mount('/content/gdrive')"
      ],
      "metadata": {
        "id": "10-Kt8ghQFK1"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from huggingface_hub import notebook_login\n",
        "\n",
        "notebook_login()"
      ],
      "metadata": {
        "id": "KOvvVyKSPJck"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import soundfile as sf\n",
        "\n",
        "sf.available_formats()"
      ],
      "metadata": {
        "id": "oNpve47cPJ9n"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "---"
      ],
      "metadata": {
        "id": "75IRpeQnxJYW"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import whisper\n",
        "import torch\n",
        "\n",
        "device = torch.device(\"cuda\")\n",
        "\n",
        "model = whisper.load_model(\"large\", device=device)"
      ],
      "metadata": {
        "id": "x0CLqnvDwg8P"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from pyannote.audio import Pipeline\n",
        "from pyannote.audio.pipelines.utils.hook import ProgressHook\n",
        "\n",
        "#device = torch.device(\"cuda\")\n",
        "\n",
        "speaker_diarization = Pipeline.from_pretrained(\n",
" \"pyannote/[email protected]\",\n", | |
" use_auth_token=True\n", | |
")\n", | |
"\n", | |
"speaker_diarization.to(device)" | |
], | |
"metadata": { | |
"id": "3rKnEyruPKwu" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!nvidia-smi" | |
], | |
"metadata": { | |
"id": "yzH_lXFOv2ur" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!cp \"/content/gdrive/MyDrive/Recordings/Day 1/Session 1.ogg\" /content/target.ogg" | |
], | |
"metadata": { | |
"id": "CHGdyOCFxCiQ" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"---" | |
], | |
"metadata": { | |
"id": "05L9PUV9xb8k" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"with ProgressHook() as hook:\n", | |
" who_speaks_when = speaker_diarization(\n", | |
" \"/content/target.ogg\",\n", | |
" num_speakers=2,\n", | |
" #min_speakers=5,\n", | |
" #max_speakers=9,\n", | |
" hook=hook\n", | |
" )" | |
], | |
"metadata": { | |
"id": "PEH4cv5VPMfC" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from pyannote.core import Segment\n", | |
"from pyannote.audio import Audio\n", | |
"\n", | |
"speakers = who_speaks_when.rename_labels({\n", | |
" #\"SPEAKER_00\": \"David\",\n", | |
" #\"SPEAKER_01\": \"Stan\",\n", | |
"})\n", | |
"\n", | |
"crop = Segment(0, 999999999)\n", | |
"#crop = Segment(5 * 60.0, 10 * 60.0)\n", | |
"audio = Audio(sample_rate=16000, mono=\"downmix\")\n", | |
"\n", | |
"def float_to_timestamp(float_time):\n", | |
" hours, remainder = divmod(float_time, 3600)\n", | |
" minutes, seconds = divmod(remainder, 60)\n", | |
" return \"{:02}:{:02}:{:04.1f}\".format(int(hours), int(minutes), seconds)\n", | |
"\n", | |
"for segment, _, speaker in speakers.crop(crop).itertracks(yield_label=True):\n", | |
" waveform, sample_rate = audio.crop(\"/content/target.ogg\", segment)\n", | |
" text = model.transcribe(waveform.squeeze().numpy(), language=\"en\", initial_prompt=\"\"\"\n", | |
"A recorded conversation between Company A consisting of Stan & David and Client consisting of … discussing a new project requirements and demonstrating … and current standard operating procedures in …, including Campaign Planning and Digital Marketing, Customer Service, IT Process, Finance.\n", | |
"\"\"\".strip())[\"text\"]\n", | |
" print(f\"{float_to_timestamp(segment.start)}-{float_to_timestamp(segment.end)} {speaker.strip()}: {text.strip()}\")" | |
], | |
"metadata": { | |
"id": "JbC9Qj0k5fuF" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Here's a tutorial on how to use GPT-4 to analyze the transcribed notes of lengthy meetings and generate detailed reports from them.
Begin by setting up GPT-4 with the following prompt, which instructs the AI to analyze the meeting notes meticulously:
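The exact wording is up to you; one illustrative possibility (assumed wording, not a canonical prompt — adapt it to your transcripts):

```
You are a meticulous minute-taker. Below is one chunk of a raw meeting
transcript with timestamps and speaker labels. Produce detailed meeting
minutes for this chunk: key discussion points, decisions made, action
items with owners, and open questions. Do not omit details.

<paste transcript chunk here>
```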
The output generated will be the detailed meeting minutes. For effective processing, break the input down and feed approximately 3,000 tokens at a time into GPT-4; this approach helps prevent the loss of essential information. A sketch of such a chunking loop follows below.
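A minimal sketch of that loop, assuming the `openai` Python package (v1+) with an `OPENAI_API_KEY` in the environment, and approximating the token budget by character count — both assumptions, not part of the original notebook:

```python
# Illustrative sketch only: chunk a long transcript and collect per-chunk
# minutes from GPT-4. Helper names like SYSTEM_PROMPT and minutes_for are
# hypothetical, introduced here for illustration.
from openai import OpenAI

client = OpenAI()

SYSTEM_PROMPT = "You are a meticulous minute-taker. ..."  # prompt from above

# ~3,000 tokens approximated as ~12,000 characters (roughly 4 chars/token);
# a real implementation would split on sentence or paragraph boundaries.
CHUNK_CHARS = 12_000

def chunk_text(text: str, size: int = CHUNK_CHARS) -> list[str]:
    """Split the transcript into roughly token-budget-sized pieces."""
    return [text[i:i + size] for i in range(0, len(text), size)]

def minutes_for(transcript: str) -> str:
    """Run each chunk through GPT-4 and concatenate the resulting minutes."""
    minutes = []
    for chunk in chunk_text(transcript):
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": chunk},
            ],
        )
        minutes.append(response.choices[0].message.content)
    return "\n\n".join(minutes)
```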
After you have the comprehensive minutes, compile them for further analysis. You can then instruct GPT-4 to summarize high-level topics, as shown in the following prompt:
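Again, the wording is yours to choose; as an illustrative sketch:

```
Below are the compiled detailed minutes of a long meeting. List the
high-level topics that were discussed, one per line, grouping closely
related items under a single topic. Keep each topic to a short phrase.

<paste compiled minutes here>
```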
GPT-4's output here should offer a solid, categorical list of topics. Use your judgement to refine it further: combine similar items and restructure the list for conciseness.
Once you have the high-level overview, proceed to create an in-depth report for each topic, working from the original, unchanged detailed meeting minutes. For this, employ a prompt structure like the following for each topic:
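One illustrative structure (assumed wording; the `<topic name>` placeholder is filled in per topic):

```
Using only the detailed meeting minutes below, write an in-depth report
on the topic "<topic name>". Cover the relevant discussion, decisions,
action items with owners, and open questions, and quote timestamps
where they are available.

<paste unchanged detailed minutes here>
```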
When working topic by topic, avoid asking GPT-4 to handle more than two items at once, as it may struggle to maintain focus. One topic at a time is typically the most manageable approach.
When you're done with the process, you'll end up with a report that turns your meeting's chaos into a neatly organized playbook of next steps. Basically, it's like having the ultimate meeting notes: they don't just remind you what was said, they tell you what to do about it.