ikegami-yukino · October 19, 2024 17:29
diff --git a/gil_pandas.py b/gil_pandas.py
 import os
 import time
 import urllib.request
 import zipfile

 import pandas as pd
 from pandarallel import pandarallel

 # Download location of the zipped postal-code CSV.
 URL = "https://www.post.japanpost.jp/zipcode/dl/utf/zip/utf_ken_all.zip"
 # Local archive name: the part of URL after its last "/".
 ZIP_FILE_NAME = URL.rsplit("/", 1)[-1]
 # Archive member that is extracted and then read with pandas.
 TARGET_CSV_NAME = "utf_ken_all.csv"
 # Number of timed repetitions for each benchmark.
 ITERATIONS = 20

 # Initialize pandarallel at import time, before the benchmarks that call
 # parallel_map / parallel_apply.
 pandarallel.initialize()


 def to_hiragana_map(x):
     """Convert a single value to hiragana.

     The value is passed through ``jaconv.h2z`` (half-width to full-width)
     and the result through ``jaconv.kata2hira`` (katakana to hiragana).

     Args:
         x: The value to convert; the script passes individual cells of a
             column read with ``dtype="object"``.

     Returns:
         Whatever ``jaconv.kata2hira`` returns for the widened value.
     """
     # Imported at call time rather than at module level, as in the original;
     # presumably so pandarallel workers can resolve it -- TODO confirm.
     from jaconv import h2z, kata2hira

     return kata2hira(h2z(x))


 def to_hiragana_apply(x):
     """Return a copy of a row with entries 3, 4 and 5 converted to hiragana.

     Each of the three entries is passed through ``jaconv.h2z`` and then
     ``jaconv.kata2hira``. All other entries are returned unchanged.

     Args:
         x: One row, indexable by the integer keys 3, 4 and 5 and providing
             ``copy()``; the script passes the rows produced by
             ``DataFrame.apply(..., axis=1)``.

     Returns:
         A copy of ``x`` with the three entries replaced. ``x`` itself is
         left untouched.
     """
     import jaconv

     # pandas documents that a function given to ``apply`` must not mutate the
     # object it receives, so the conversion is done on a copy of the row.
     row = x.copy()
     for i in range(3, 6):
         row[i] = jaconv.kata2hira(jaconv.h2z(row[i]))
     return row


 if __name__ == "__main__":
     # Fetch the zip archive only when no local copy exists yet.
     archive_missing = not os.path.exists(ZIP_FILE_NAME)
     if archive_missing:
         urllib.request.urlretrieve(URL, ZIP_FILE_NAME)

     # Unpack the CSV into the current directory unless it is already there.
     csv_missing = not os.path.exists(TARGET_CSV_NAME)
     if csv_missing:
         with zipfile.ZipFile(ZIP_FILE_NAME) as archive:
             archive.extract(TARGET_CSV_NAME, "./")

     # No header row; every column is kept as object dtype.
     df = pd.read_csv(TARGET_CSV_NAME, header=None, dtype="object")

     # Inflate the data set to ten times its original size.
     df = pd.concat([df] * 10)

     # Columns that every benchmark converts.
     target_columns = (3, 4, 5)

     # Benchmark 1: Series.map, one column at a time.
     # NOTE: results are written back into df, so every pass after the first
     # (and every later benchmark) runs on already-converted values.
     elapsed_times = []
     for _ in range(ITERATIONS):
         began = time.monotonic()
         for col in target_columns:
             df[col] = df[col].map(to_hiragana_map)
         elapsed_times.append(time.monotonic() - began)

     print(f"[map] Average elapsed time: {sum(elapsed_times) / ITERATIONS}")

     # Benchmark 2: pandarallel's parallel_map on the same three columns.
     elapsed_times = []
     for _ in range(ITERATIONS):
         began = time.monotonic()
         for col in target_columns:
             df[col] = df[col].parallel_map(to_hiragana_map)
         elapsed_times.append(time.monotonic() - began)

     print(f"[parallel_map] Average elapsed time: {sum(elapsed_times) / ITERATIONS}")

     # Benchmark 3: row-wise DataFrame.apply.
     elapsed_times = []
     for _ in range(ITERATIONS):
         began = time.monotonic()
         df = df.apply(to_hiragana_apply, axis=1)
         elapsed_times.append(time.monotonic() - began)

     print(f"[apply] Average elapsed time: {sum(elapsed_times) / ITERATIONS}")

     # Benchmark 4: row-wise pandarallel parallel_apply.
     elapsed_times = []
     for _ in range(ITERATIONS):
         began = time.monotonic()
         df = df.parallel_apply(to_hiragana_apply, axis=1)
         elapsed_times.append(time.monotonic() - began)

     print(f"[parallel_apply] Average elapsed time: {sum(elapsed_times) / ITERATIONS}")
	import os
	import time
	import urllib.request
	import zipfile

	import pandas as pd
	from pandarallel import pandarallel

	# Where the zipped postal-code CSV is downloaded from.
	URL = "https://www.post.japanpost.jp/zipcode/dl/utf/zip/utf_ken_all.zip"
	# File name of the local archive: the last "/"-separated part of URL.
	ZIP_FILE_NAME = URL.rpartition("/")[2]
	# Name of the CSV member inside the archive.
	TARGET_CSV_NAME = "utf_ken_all.csv"
	# How many times each benchmark is repeated.
	ITERATIONS = 20

	# Initialize pandarallel at import time, before the benchmarks that call
	# parallel_map / parallel_apply.
	pandarallel.initialize()


	def to_hiragana_map(x):
	    """Convert a single value to hiragana.

	    The value is passed through ``jaconv.h2z`` (half-width to full-width)
	    and the result through ``jaconv.kata2hira`` (katakana to hiragana).

	    Args:
	        x: The value to convert; the script passes individual cells of a
	            column read with ``dtype="object"``.

	    Returns:
	        Whatever ``jaconv.kata2hira`` returns for the widened value.
	    """
	    # Kept as a call-time import, as in the original statement order.
	    import jaconv

	    x = jaconv.kata2hira(jaconv.h2z(x))
	    return x


	def to_hiragana_apply(x):
	    """Return a copy of a row with entries 3, 4 and 5 converted to hiragana.

	    Each of the three entries is passed through ``jaconv.h2z`` and then
	    ``jaconv.kata2hira``. All other entries are returned unchanged.

	    Args:
	        x: One row, indexable by the integer keys 3, 4 and 5 and providing
	            ``copy()``; the script passes the rows produced by
	            ``DataFrame.apply(..., axis=1)``.

	    Returns:
	        A copy of ``x`` with the three entries replaced. ``x`` itself is
	        left untouched.
	    """
	    import jaconv

	    # pandas documents that a function given to ``apply`` must not mutate
	    # the object it receives, so the conversion is done on a copy.
	    row = x.copy()
	    for i in range(3, 6):
	        row[i] = jaconv.kata2hira(jaconv.h2z(row[i]))
	    return row


	if __name__ == "__main__":
	    # Download the zip archive only when no local copy exists yet.
	    if not os.path.exists(ZIP_FILE_NAME):
	        urllib.request.urlretrieve(URL, ZIP_FILE_NAME)

	    # Extract the CSV into the current directory unless it is already there.
	    if not os.path.exists(TARGET_CSV_NAME):
	        with zipfile.ZipFile(ZIP_FILE_NAME) as zf:
	            zf.extract(TARGET_CSV_NAME, "./")

	    # No header row; every column is kept as object dtype.
	    df = pd.read_csv(TARGET_CSV_NAME, header=None, dtype="object")

	    # Pad the data volume to ten times its original size.
	    df = pd.concat([df for _ in range(10)])

	    # Benchmark 1: Series.map on columns 3, 4 and 5.
	    # NOTE: results are written back into df, so every pass after the first
	    # (and every later benchmark) runs on already-converted values.
	    elapsed_times = []
	    for _ in range(ITERATIONS):
	        start_time = time.monotonic()

	        df[3] = df[3].map(to_hiragana_map)
	        df[4] = df[4].map(to_hiragana_map)
	        df[5] = df[5].map(to_hiragana_map)

	        elapsed_times.append(time.monotonic() - start_time)

	    print(f"[map] Average elapsed time: {sum(elapsed_times) / ITERATIONS}")

	    # Benchmark 2: pandarallel's parallel_map on the same columns.
	    elapsed_times = []
	    for _ in range(ITERATIONS):
	        start_time = time.monotonic()

	        df[3] = df[3].parallel_map(to_hiragana_map)
	        df[4] = df[4].parallel_map(to_hiragana_map)
	        df[5] = df[5].parallel_map(to_hiragana_map)

	        elapsed_times.append(time.monotonic() - start_time)

	    print(f"[parallel_map] Average elapsed time: {sum(elapsed_times) / ITERATIONS}")

	    # Benchmark 3: row-wise DataFrame.apply.
	    elapsed_times = []
	    for _ in range(ITERATIONS):
	        start_time = time.monotonic()
	        df = df.apply(to_hiragana_apply, axis=1)
	        elapsed_times.append(time.monotonic() - start_time)

	    print(f"[apply] Average elapsed time: {sum(elapsed_times) / ITERATIONS}")

	    # Benchmark 4: row-wise pandarallel parallel_apply.
	    elapsed_times = []
	    for _ in range(ITERATIONS):
	        start_time = time.monotonic()
	        df = df.parallel_apply(to_hiragana_apply, axis=1)
	        elapsed_times.append(time.monotonic() - start_time)

	    print(f"[parallel_apply] Average elapsed time: {sum(elapsed_times) / ITERATIONS}")