Last active
November 20, 2020 05:32
-
-
Save tsudoko/1d3e947aae6a909110b393ebdd8d61c2 to your computer and use it in GitHub Desktop.
Official kanji list scrapers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__pycache__ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
from pdfminer import layout | |
import utils | |
# ref: https://web.archive.org/web/20170708205442/http://www.mext.go.jp:80/component/a_menu/education/micro_detail/__icsFiles/afieldfile/2017/05/12/1384661_4_2.pdf | |
debug_chars = "" | |
def contains(haystack, needle):
    """Report whether the text *needle* occurs inside *haystack*.

    *haystack* may be either ``str`` or ``bytes``.  When it is ``bytes`` the
    needle is encoded with ``str.encode()``'s default codec before the
    membership test, so callers can always pass the needle as text.
    """
    probe = needle.encode() if isinstance(haystack, bytes) else needle
    return probe in haystack
def pdf_process(page):
    """Collect the characters printed on one page, grouped by band.

    Only ``LTChar`` objects whose font name contains ``KyoikuKanji`` are
    considered.  Each such character is filed under the index of every
    band it is not rejected by (see the bounds test below).

    Returns a dict mapping band index -> string of characters in page
    iteration order.  Indices that received no character are absent.
    """
    left, right = 140, 510
    # (y1, y2) pairs for the six bands; the first value is the larger one.
    bands = [
        (720, 690),
        (640, 550),
        (520, 400),
        (380, 270),
        (240, 130),
        (100, 90),
    ]
    if page.pageid == 32:
        # Page 32 is handled as one full-height region whose characters
        # are all filed under index 5.
        regions = [(5, (left, 999, right, 0))]
    else:
        regions = [
            (idx, (left, y_hi, right, y_lo))
            for idx, (y_hi, y_lo) in enumerate(bands)
        ]

    found = {}
    for obj in page:
        if not isinstance(obj, layout.LTChar) or not contains(obj.fontname, "KyoikuKanji"):
            continue
        text = obj.get_text()
        cx1, cy1, cx2, cy2 = obj.bbox
        for idx, (bx1, by1, bx2, by2) in regions:
            # Each flag is True when the character is rejected on that side.
            rejected = (cx1 < bx1, cy1 > by1, cx2 > bx2, cy2 < by2)
            if text in debug_chars:
                # Trace output for characters listed in debug_chars.
                print(text)
                print(obj)
                print(obj.bbox)
                for flag in rejected:
                    print(flag)
            if any(rejected):
                continue
            found[idx] = found.get(idx, "") + text
    return found
def pdf(f):
    """Extract the per-index character strings from the PDF open on *f*.

    Only pages whose ``pageid`` is 31 or 32 are read.  Page 31 supplies the
    initial mapping returned by ``pdf_process``; the index-5 characters
    found on page 32 are appended to index 5 of that mapping.

    Args:
        f: binary file object positioned at the start of the PDF.

    Returns:
        A list of strings where position ``i`` holds the characters filed
        under index ``i``.  Indices for which nothing was found are
        returned as empty strings; an empty list is returned when no
        characters were found at all.
    """
    kanji = {}
    for p in utils.pages(f):
        if p.pageid not in (31, 32):
            continue
        found = pdf_process(p)
        if p.pageid == 31:
            kanji = found
            continue
        # Page 32: either side may lack index 5 (pdf_process omits empty
        # indices), so avoid a bare KeyError on a sparse extraction.
        extra = found.get(5, "")
        if extra:
            kanji[5] = kanji.get(5, "") + extra
    # Dump the raw mapping for inspection.
    print(kanji)
    if not kanji:
        return []
    # Index by position up to the highest key so a missing index shows up
    # as an empty string instead of raising KeyError.
    return [kanji.get(i, "") for i in range(max(kanji) + 1)]
def pdf_ok(out):
    """Assert that *out* holds the expected number of characters per index.

    *out* is indexed by position 0..5; the first mismatch raises
    ``AssertionError``.
    """
    expected_counts = (80, 160, 200, 202, 193, 191)
    for position, count in enumerate(expected_counts):
        assert len(out[position]) == count
if __name__ == "__main__":
    # Usage: pass the PDF path as the first argument.  Prints one
    # "<number>\t<characters>" line per entry, numbered from 1.
    with open(sys.argv[1], "rb") as pdf_file:
        grades = pdf(pdf_file)
    for number, chars in enumerate(grades, start=1):
        print(f"{number}\t{chars}")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import collections | |
import math | |
import sys | |
from pdfminer import layout | |
import utils | |
# ref: https://web.archive.org/web/20180829192608/http://www.kanken.or.jp/kanken/outline/data/outline_degree_national_list.pdf | |
debug_chars = '' | |
def pdf_process(page):
    """Extract the level label and the table characters of one page.

    Args:
        page: iterable of pdfminer layout objects; only ``LTChar``
            instances are looked at.

    Returns:
        A ``(level, chars)`` tuple.  ``level`` is the text of the
        characters whose bbox y0 floors to 782, read up to and including
        the first ``級`` (empty string if there is none).  ``chars`` is the
        text of the characters that pass the table bounds test, in
        iteration order, skipping 〔, 〕 and anything already collected on
        this page.
    """
    # Filter once; all three passes below only care about LTChar objects.
    glyphs = [o for o in page if isinstance(o, layout.LTChar)]

    # The y extent of the table is fixed; the x extent depends on the
    # page type.  Default: tables on continued pages (4級 その2 etc.) are
    # wider.
    left, top, right, bottom = 60, 741, 525, 84
    for o in glyphs:
        marker = o.get_text()
        if marker == '〈':
            # regular page
            left, right = 49, 488
            break
        if marker == '※':
            # table on the last page is thinner
            left, right = 105, 528
            break

    # The level label is accumulated from the y0 == 782 line and ends at 級.
    level = ""
    for o in glyphs:
        if math.floor(o.bbox[1]) == 782:
            level += o.get_text()
        if level.endswith('級'):
            break

    chars = ""
    for o in glyphs:
        char = o.get_text()
        x1, y1, x2, y2 = o.bbox
        if char in debug_chars:
            # Trace output for characters listed in debug_chars.
            print(char)
            print(o.bbox)
            print(x1 < left)
            print(y1 > top)
            print(x2 > right)
            print(y2 < bottom)
        if x1 < left or y1 > top or x2 > right or y2 < bottom:
            continue
        # Drop the bracket glyphs and anything already seen on this page.
        if char in "〔〕" or char in chars:
            continue
        chars += char
    return level, chars
def pdf(f):
    """Build an ordered mapping of level label -> characters for the PDF on *f*.

    Pages for which ``pdf_process`` returns an empty level are ignored.
    Characters from pages sharing a level are concatenated in page order,
    and levels keep the order in which they were first seen.
    """
    by_level = collections.OrderedDict()
    for page in utils.pages(f):
        level, chars = pdf_process(page)
        if level:
            by_level[level] = by_level.get(level, '') + chars
    return by_level
def pdf_ok(out):
    """Assert that *out* holds the expected number of characters per level.

    *out* is indexed by level label; the first mismatch raises
    ``AssertionError``.
    """
    expected_counts = (
        ('10級', 80),
        ('9級', 160),
        ('8級', 200),
        ('7級', 200),
        ('6級', 185),
        ('5級', 181),
        ('4級', 316),
        ('3級', 285),
        ('準2級', 333),
        ('2級', 196),
    )
    for label, count in expected_counts:
        assert len(out[label]) == count
if __name__ == "__main__":
    # Usage: pass the PDF path as the first argument.  Prints one
    # "<level>\t<characters>" line per level.
    with open(sys.argv[1], "rb") as pdf_file:
        levels = pdf(pdf_file)
    for label, chars in levels.items():
        print(f"{label}\t{chars}")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pdfminer import pdfdocument, pdfparser, pdfinterp, pdfpage, converter, layout | |
def pages(f):
    """Yield the aggregator's result for each page of the PDF open on *f*.

    Every page returned by ``PDFPage.get_pages`` is run through a page
    interpreter that shares one resource manager and one
    ``PDFPageAggregator``; the aggregator's result is yielded after each
    page is processed.
    """
    resources = pdfinterp.PDFResourceManager()
    aggregator = converter.PDFPageAggregator(resources)
    interpreter = pdfinterp.PDFPageInterpreter(resources, aggregator)
    for raw_page in pdfpage.PDFPage.get_pages(f):
        interpreter.process_page(raw_page)
        yield aggregator.get_result()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment