huseinzol05 · February 13, 2025 03:21
diff --git a/noob-parquet-datasets.ipynb b/noob-parquet-datasets.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "04e53621-231b-48ff-846a-eac8d44c2536",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ml-researcher/.local/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Fetching 209 files:   0%|▎                                                                            | 1/209 [00:51<2:58:40, 51.54s/it]\n",
      "\n",
      "KeyboardInterrupt\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from huggingface_hub import snapshot_download\n",
    "# Download only the files under the '2025-02-06 01:30:41/' folder of the\n",
    "# dataset repository into the current working directory.\n",
    "snapshot_download(\n",
    "    repo_id=\"hendrick-chong-02/malaysian-chinese-youtube\",\n",
    "    repo_type='dataset',\n",
    "    allow_patterns=[\"2025-02-06 01:30:41/*\"],\n",
    "    local_dir='./',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "016b203a-10fd-4419-9e4d-a7e9e10f1035",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "59"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from glob import glob\n",
    "from torch.utils.data import DataLoader, Dataset\n",
    "from datasets import Audio\n",
    "import pyarrow.parquet as pq\n",
    "\n",
    "def get_parquet_row_count(file_path):\n",
    "    \"\"\"Return the row count recorded in the metadata of the parquet file at `file_path`.\"\"\"\n",
    "    return pq.ParquetFile(file_path).metadata.num_rows\n",
    "    \n",
    "# Parquet files one directory below the working directory, in sorted path order.\n",
    "files = glob('*/*.parquet')\n",
    "files.sort()\n",
    "len(files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "6199b3a5-5d64-4f8b-ad19-3a85ce3fb130",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 9.11 ms, sys: 3.13 ms, total: 12.2 ms\n",
      "Wall time: 11.3 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Map each file's first global row index to a descriptor of that file.\n",
    "# Note: 'end' stores the file's row count, not an end index.\n",
    "global_indices = {}\n",
    "start = 0\n",
    "for f in files:\n",
    "    row_size = get_parquet_row_count(f)\n",
    "    row = dict(start=start, end=row_size, filename=f)\n",
    "    global_indices[start] = row\n",
    "    start += row_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "40dafe18-526f-448d-b913-3108e0bc8936",
   "metadata": {},
   "outputs": [],
   "source": [
    "class ParquetDataset(Dataset):\n",
    "    \"\"\"Map-style dataset serving single rows out of several parquet files.\n",
    "\n",
    "    Each global row index is mapped to the descriptor of the file that holds\n",
    "    it. Files are read whole with pandas on first use; the cache of loaded\n",
    "    DataFrames is trimmed to stay within `maxlen_cache_df` entries (the frame\n",
    "    currently being served is always kept).\n",
    "\n",
    "    Args:\n",
    "        indices: dict keyed by a file's first global row index. Each value is\n",
    "            a dict with 'start' (first global row index of the file), 'end'\n",
    "            (number of rows in the file, not an end index) and 'filename'.\n",
    "        maxlen_cache_df: upper bound on the number of cached DataFrames.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, indices, maxlen_cache_df=5):\n",
    "        # Expand the per-file ranges into a per-row lookup table.\n",
    "        self.indices = {}\n",
    "        for k, v in indices.items():\n",
    "            for i in range(int(k), v['start'] + v['end']):\n",
    "                self.indices[i] = v\n",
    "\n",
    "        self.max_index = len(self.indices)\n",
    "        # filename -> DataFrame, ordered from least to most recently used.\n",
    "        self.cache_df = {}\n",
    "        self.maxlen_cache_df = maxlen_cache_df\n",
    "        self.audio = Audio(sampling_rate=16000)\n",
    "\n",
    "    def __len__(self):\n",
    "        \"\"\"Return the total number of rows across all indexed files.\"\"\"\n",
    "        return self.max_index\n",
    "\n",
    "    def _load_frame(self, filename):\n",
    "        \"\"\"Return the DataFrame of `filename`, reading it from disk on a cache miss.\"\"\"\n",
    "        if filename in self.cache_df:\n",
    "            # Pop and re-insert so this entry becomes the most recently used.\n",
    "            df = self.cache_df.pop(filename)\n",
    "        else:\n",
    "            # Evict least recently used frames *before* reading the new file so\n",
    "            # the new frame is never held on top of a full cache.\n",
    "            while self.cache_df and len(self.cache_df) >= self.maxlen_cache_df:\n",
    "                self.cache_df.pop(next(iter(self.cache_df)))\n",
    "            df = pd.read_parquet(filename)\n",
    "        self.cache_df[filename] = df\n",
    "        return df\n",
    "\n",
    "    def __getitem__(self, item):\n",
    "        \"\"\"Return row `item` as a dict whose 'audio' entry is the decoded sample array.\n",
    "\n",
    "        Negative values of `item` count from the end of the dataset.\n",
    "\n",
    "        Raises:\n",
    "            IndexError: if `item` lies outside the dataset.\n",
    "        \"\"\"\n",
    "        if item < 0:\n",
    "            item = self.max_index + item\n",
    "        if item not in self.indices:\n",
    "            # IndexError (rather than KeyError) is what lets a plain\n",
    "            # `for row in dataset` loop stop at the end of the data.\n",
    "            raise IndexError(f'index {item} is out of range for dataset of length {self.max_index}')\n",
    "\n",
    "        v = self.indices[item]\n",
    "        df = self._load_frame(v['filename'])\n",
    "        # Position of the row inside its own file.\n",
    "        row = df.iloc[item - v['start']].to_dict()\n",
    "        row['audio'] = self.audio.decode_example(self.audio.encode_example(row['audio']))['array']\n",
    "        return row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "247b9abd-5dfb-47f2-bec4-76901cb6321f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "35833"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the row-level dataset on top of the per-file index and report its size.\n",
    "dataset = ParquetDataset(indices=global_indices)\n",
    "len(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "d7bcc900-f8e8-4c52-ad1f-4fd044194340",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 223 ms, sys: 1.56 s, total: 1.78 s\n",
      "Wall time: 812 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'audio': array([-0.00125122, -0.00079346,  0.        , ..., -0.00183105,\n",
       "         0.00012207,  0.00296021]),\n",
       " 'video id': '18RUjXWtV28',\n",
       " 'chunk number': 0,\n",
       " 'transcription': '我前,身份在身边好啦,是Good morning!今天是一个非常开心的一天看我春风满面就知道最近发生了两件很重要的事情要跟大家分享No.1Hepi在上个星期拿了人生第一张Driving license那一个下午当下我直接去买男生第一辆车买了新车回到家的第一个moment我打开门发生第二件很值得开心的事情Hepi一进家就发现一个包裹一打开包裹你知道是什么吗竟然是全新的Sony A7C',\n",
       " 'timestamp': \"[{'timestamp': (0.0, 1.54), 'text': '我前,身份在身边'}, {'timestamp': (1.54, 2.3), 'text': '好啦,是'}, {'timestamp': (2.3, 3.6), 'text': 'Good morning!'}, {'timestamp': (3.6, 5.3), 'text': '今天是一个非常开心的一天'}, {'timestamp': (5.3, 7.6), 'text': '看我春风满面就知道最近发生了'}, {'timestamp': (7.6, 9.6), 'text': '两件很重要的事情要跟大家分享'}, {'timestamp': (9.6, 10.14), 'text': 'No.1'}, {'timestamp': (10.14, 11.06), 'text': 'Hepi在上个星期'}, {'timestamp': (11.06, 12.56), 'text': '拿了人生第一张Driving license'}, {'timestamp': (12.56, 13.34), 'text': '那一个下午'}, {'timestamp': (13.34, 14.06), 'text': '当下'}, {'timestamp': (14.06, 14.64), 'text': '我直接'}, {'timestamp': (14.64, 16.34), 'text': '去买男生第一辆车'}, {'timestamp': (16.34, 18.24), 'text': '买了新车回到家的第一个moment'}, {'timestamp': (18.24, 19.14), 'text': '我打开门'}, {'timestamp': (19.14, 21.14), 'text': '发生第二件很值得开心的事情'}, {'timestamp': (21.14, 22.7), 'text': 'Hepi一进家就发现一个包裹'}, {'timestamp': (22.7, 24.44), 'text': '一打开包裹你知道是什么吗'}, {'timestamp': (24.44, 25.44), 'text': '竟然是'}, {'timestamp': (25.44, 28.02), 'text': '全新的Sony A7C'}]\"}"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "dataset[1000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "0fca609b-38ad-4086-853b-5205393c1e41",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.19 ms, sys: 2.22 ms, total: 6.41 ms\n",
      "Wall time: 5.41 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'audio': array([ 7.32421875e-04, -6.10351562e-05, -1.12915039e-03, ...,\n",
       "        -5.13000488e-02, -4.47082520e-02, -3.10974121e-02]),\n",
       " 'video id': '18RUjXWtV28',\n",
       " 'chunk number': 1,\n",
       " 'transcription': '我只能講我太開心Sony來得太對時機了所以這一次的VlogHabby決定使用這台全新的A7C一邊拍攝一邊開上我的新車哈哈這樣子的話你可以在同一支影片裡面看到兩個的小寶貝這樣啊Ida cutBefore看車之前呢Habby先把今天這一集最廢的嘉賓請出來她是人稱書棒區美女收割機很像換女朋友喂喂哈哈哈哈女朋友你刷Credit看來會快書家說好壞',\n",
       " 'timestamp': \"[{'timestamp': (0.0, 1.6), 'text': '我只能講我太開心'}, {'timestamp': (1.6, 3.76), 'text': 'Sony來得太對時機了'}, {'timestamp': (3.76, 4.68), 'text': '所以這一次的Vlog'}, {'timestamp': (4.68, 7.5), 'text': 'Habby決定使用這台全新的A7C'}, {'timestamp': (7.5, 8.3), 'text': '一邊拍攝'}, {'timestamp': (8.3, 9.76), 'text': '一邊開上我的新車'}, {'timestamp': (9.76, 10.34), 'text': '哈哈'}, {'timestamp': (10.34, 11.0), 'text': '這樣子的話'}, {'timestamp': (11.0, 12.96), 'text': '你可以在同一支影片裡面看到'}, {'timestamp': (12.96, 14.5), 'text': '兩個的小寶貝'}, {'timestamp': (14.5, 15.2), 'text': '這樣啊'}, {'timestamp': (15.2, 15.94), 'text': 'Ida cut'}, {'timestamp': (15.94, 17.26), 'text': 'Before看車之前呢'}, {'timestamp': (17.26, 18.9), 'text': 'Habby先把今天這一集'}, {'timestamp': (18.9, 20.2), 'text': '最廢的嘉賓請出來'}, {'timestamp': (20.2, 23.2), 'text': '她是人稱書棒區美女收割機'}, {'timestamp': (23.2, 24.3), 'text': '很像換女朋友'}, {'timestamp': (24.3, 24.66), 'text': '喂喂'}, {'timestamp': (24.66, 25.44), 'text': '哈哈哈哈'}, {'timestamp': (25.44, 26.0), 'text': '女朋友'}, {'timestamp': (26.0, 27.2), 'text': '你刷Credit看來會快'}, {'timestamp': (27.2, 28.14), 'text': '書家說好壞'}]\"}"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "dataset[1001]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a5681b7-91e7-4f55-a9dc-8c1db4439bc7",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "04e53621-231b-48ff-846a-eac8d44c2536",
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/home/ml-researcher/.local/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
	" from .autonotebook import tqdm as notebook_tqdm\n",
	"Fetching 209 files:   0%|▎                                                                            | 1/209 [00:51<2:58:40, 51.54s/it]\n",
	"\n",
	"KeyboardInterrupt\n",
	"\n"
	]
	}
	],
	"source": [
	"from huggingface_hub import snapshot_download\n",
	"# Download only the files under the '2025-02-06 01:30:41/' folder of the\n",
	"# dataset repository into the current working directory.\n",
	"snapshot_download(\n",
	"    repo_id=\"hendrick-chong-02/malaysian-chinese-youtube\",\n",
	"    repo_type='dataset',\n",
	"    allow_patterns=[\"2025-02-06 01:30:41/*\"],\n",
	"    local_dir='./',\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 44,
	"id": "016b203a-10fd-4419-9e4d-a7e9e10f1035",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"59"
	]
	},
	"execution_count": 44,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import pandas as pd\n",
	"from glob import glob\n",
	"from torch.utils.data import DataLoader, Dataset\n",
	"from datasets import Audio\n",
	"import pyarrow.parquet as pq\n",
	"\n",
	"def get_parquet_row_count(file_path):\n",
	"    \"\"\"Return the row count recorded in the metadata of the parquet file at `file_path`.\"\"\"\n",
	"    return pq.ParquetFile(file_path).metadata.num_rows\n",
	" \n",
	"# Parquet files one directory below the working directory, in sorted path order.\n",
	"files = sorted(glob('*/*.parquet'))\n",
	"len(files)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 60,
	"id": "6199b3a5-5d64-4f8b-ad19-3a85ce3fb130",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 9.11 ms, sys: 3.13 ms, total: 12.2 ms\n",
	"Wall time: 11.3 ms\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"\n",
	"# Map each file's first global row index to a descriptor of that file.\n",
	"# Note: 'end' stores the file's row count, not an end index.\n",
	"global_indices = {}\n",
	"start = 0\n",
	"for f in files:\n",
	"    row_size = get_parquet_row_count(f)\n",
	"    row = dict(start=start, end=row_size, filename=f)\n",
	"    global_indices[start] = row\n",
	"    start += row_size"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 55,
	"id": "40dafe18-526f-448d-b913-3108e0bc8936",
	"metadata": {},
	"outputs": [],
	"source": [
	"class ParquetDataset(Dataset):\n",
	"    \"\"\"Map-style dataset serving single rows out of several parquet files.\n",
	"\n",
	"    Each global row index is mapped to the descriptor of the file that holds\n",
	"    it. Files are read whole with pandas on first use; the cache of loaded\n",
	"    DataFrames is trimmed to stay within `maxlen_cache_df` entries (the frame\n",
	"    currently being served is always kept).\n",
	"\n",
	"    Args:\n",
	"        indices: dict keyed by a file's first global row index. Each value is\n",
	"            a dict with 'start' (first global row index of the file), 'end'\n",
	"            (number of rows in the file, not an end index) and 'filename'.\n",
	"        maxlen_cache_df: upper bound on the number of cached DataFrames.\n",
	"    \"\"\"\n",
	"\n",
	"    def __init__(self, indices, maxlen_cache_df=5):\n",
	"        # Expand the per-file ranges into a per-row lookup table.\n",
	"        self.indices = {}\n",
	"        for k, v in indices.items():\n",
	"            for i in range(int(k), v['start'] + v['end']):\n",
	"                self.indices[i] = v\n",
	"\n",
	"        self.max_index = len(self.indices)\n",
	"        # filename -> DataFrame, ordered from least to most recently used.\n",
	"        self.cache_df = {}\n",
	"        self.maxlen_cache_df = maxlen_cache_df\n",
	"        self.audio = Audio(sampling_rate=16000)\n",
	"\n",
	"    def __len__(self):\n",
	"        \"\"\"Return the total number of rows across all indexed files.\"\"\"\n",
	"        return self.max_index\n",
	"\n",
	"    def _load_frame(self, filename):\n",
	"        \"\"\"Return the DataFrame of `filename`, reading it from disk on a cache miss.\"\"\"\n",
	"        if filename in self.cache_df:\n",
	"            # Pop and re-insert so this entry becomes the most recently used.\n",
	"            df = self.cache_df.pop(filename)\n",
	"        else:\n",
	"            # Evict least recently used frames *before* reading the new file so\n",
	"            # the new frame is never held on top of a full cache.\n",
	"            while self.cache_df and len(self.cache_df) >= self.maxlen_cache_df:\n",
	"                self.cache_df.pop(next(iter(self.cache_df)))\n",
	"            df = pd.read_parquet(filename)\n",
	"        self.cache_df[filename] = df\n",
	"        return df\n",
	"\n",
	"    def __getitem__(self, item):\n",
	"        \"\"\"Return row `item` as a dict whose 'audio' entry is the decoded sample array.\n",
	"\n",
	"        Negative values of `item` count from the end of the dataset.\n",
	"\n",
	"        Raises:\n",
	"            IndexError: if `item` lies outside the dataset.\n",
	"        \"\"\"\n",
	"        if item < 0:\n",
	"            item = self.max_index + item\n",
	"        if item not in self.indices:\n",
	"            # IndexError (rather than KeyError) is what lets a plain\n",
	"            # `for row in dataset` loop stop at the end of the data.\n",
	"            raise IndexError(f'index {item} is out of range for dataset of length {self.max_index}')\n",
	"\n",
	"        v = self.indices[item]\n",
	"        df = self._load_frame(v['filename'])\n",
	"        # Position of the row inside its own file.\n",
	"        row = df.iloc[item - v['start']].to_dict()\n",
	"        row['audio'] = self.audio.decode_example(self.audio.encode_example(row['audio']))['array']\n",
	"        return row"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 61,
	"id": "247b9abd-5dfb-47f2-bec4-76901cb6321f",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"35833"
	]
	},
	"execution_count": 61,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Build the row-level dataset on top of the per-file index and report its size.\n",
	"dataset = ParquetDataset(indices=global_indices)\n",
	"len(dataset)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 62,
	"id": "d7bcc900-f8e8-4c52-ad1f-4fd044194340",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 223 ms, sys: 1.56 s, total: 1.78 s\n",
	"Wall time: 812 ms\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"{'audio': array([-0.00125122, -0.00079346, 0. , ..., -0.00183105,\n",
	" 0.00012207, 0.00296021]),\n",
	" 'video id': '18RUjXWtV28',\n",
	" 'chunk number': 0,\n",
	" 'transcription': '我前,身份在身边好啦,是Good morning!今天是一个非常开心的一天看我春风满面就知道最近发生了两件很重要的事情要跟大家分享No.1Hepi在上个星期拿了人生第一张Driving license那一个下午当下我直接去买男生第一辆车买了新车回到家的第一个moment我打开门发生第二件很值得开心的事情Hepi一进家就发现一个包裹一打开包裹你知道是什么吗竟然是全新的Sony A7C',\n",
	" 'timestamp': \"[{'timestamp': (0.0, 1.54), 'text': '我前,身份在身边'}, {'timestamp': (1.54, 2.3), 'text': '好啦,是'}, {'timestamp': (2.3, 3.6), 'text': 'Good morning!'}, {'timestamp': (3.6, 5.3), 'text': '今天是一个非常开心的一天'}, {'timestamp': (5.3, 7.6), 'text': '看我春风满面就知道最近发生了'}, {'timestamp': (7.6, 9.6), 'text': '两件很重要的事情要跟大家分享'}, {'timestamp': (9.6, 10.14), 'text': 'No.1'}, {'timestamp': (10.14, 11.06), 'text': 'Hepi在上个星期'}, {'timestamp': (11.06, 12.56), 'text': '拿了人生第一张Driving license'}, {'timestamp': (12.56, 13.34), 'text': '那一个下午'}, {'timestamp': (13.34, 14.06), 'text': '当下'}, {'timestamp': (14.06, 14.64), 'text': '我直接'}, {'timestamp': (14.64, 16.34), 'text': '去买男生第一辆车'}, {'timestamp': (16.34, 18.24), 'text': '买了新车回到家的第一个moment'}, {'timestamp': (18.24, 19.14), 'text': '我打开门'}, {'timestamp': (19.14, 21.14), 'text': '发生第二件很值得开心的事情'}, {'timestamp': (21.14, 22.7), 'text': 'Hepi一进家就发现一个包裹'}, {'timestamp': (22.7, 24.44), 'text': '一打开包裹你知道是什么吗'}, {'timestamp': (24.44, 25.44), 'text': '竟然是'}, {'timestamp': (25.44, 28.02), 'text': '全新的Sony A7C'}]\"}"
	]
	},
	"execution_count": 62,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%time\n",
	"\n",
	"dataset[1000]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 63,
	"id": "0fca609b-38ad-4086-853b-5205393c1e41",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 4.19 ms, sys: 2.22 ms, total: 6.41 ms\n",
	"Wall time: 5.41 ms\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"{'audio': array([ 7.32421875e-04, -6.10351562e-05, -1.12915039e-03, ...,\n",
	" -5.13000488e-02, -4.47082520e-02, -3.10974121e-02]),\n",
	" 'video id': '18RUjXWtV28',\n",
	" 'chunk number': 1,\n",
	" 'transcription': '我只能講我太開心Sony來得太對時機了所以這一次的VlogHabby決定使用這台全新的A7C一邊拍攝一邊開上我的新車哈哈這樣子的話你可以在同一支影片裡面看到兩個的小寶貝這樣啊Ida cutBefore看車之前呢Habby先把今天這一集最廢的嘉賓請出來她是人稱書棒區美女收割機很像換女朋友喂喂哈哈哈哈女朋友你刷Credit看來會快書家說好壞',\n",
	" 'timestamp': \"[{'timestamp': (0.0, 1.6), 'text': '我只能講我太開心'}, {'timestamp': (1.6, 3.76), 'text': 'Sony來得太對時機了'}, {'timestamp': (3.76, 4.68), 'text': '所以這一次的Vlog'}, {'timestamp': (4.68, 7.5), 'text': 'Habby決定使用這台全新的A7C'}, {'timestamp': (7.5, 8.3), 'text': '一邊拍攝'}, {'timestamp': (8.3, 9.76), 'text': '一邊開上我的新車'}, {'timestamp': (9.76, 10.34), 'text': '哈哈'}, {'timestamp': (10.34, 11.0), 'text': '這樣子的話'}, {'timestamp': (11.0, 12.96), 'text': '你可以在同一支影片裡面看到'}, {'timestamp': (12.96, 14.5), 'text': '兩個的小寶貝'}, {'timestamp': (14.5, 15.2), 'text': '這樣啊'}, {'timestamp': (15.2, 15.94), 'text': 'Ida cut'}, {'timestamp': (15.94, 17.26), 'text': 'Before看車之前呢'}, {'timestamp': (17.26, 18.9), 'text': 'Habby先把今天這一集'}, {'timestamp': (18.9, 20.2), 'text': '最廢的嘉賓請出來'}, {'timestamp': (20.2, 23.2), 'text': '她是人稱書棒區美女收割機'}, {'timestamp': (23.2, 24.3), 'text': '很像換女朋友'}, {'timestamp': (24.3, 24.66), 'text': '喂喂'}, {'timestamp': (24.66, 25.44), 'text': '哈哈哈哈'}, {'timestamp': (25.44, 26.0), 'text': '女朋友'}, {'timestamp': (26.0, 27.2), 'text': '你刷Credit看來會快'}, {'timestamp': (27.2, 28.14), 'text': '書家說好壞'}]\"}"
	]
	},
	"execution_count": 63,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%time\n",
	"\n",
	"dataset[1001]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "3a5681b7-91e7-4f55-a9dc-8c1db4439bc7",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.12.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}