Created
October 17, 2022 14:33
-
-
Save thulc/7b3a8164e5fd974d963867a18be42541 to your computer and use it in GitHub Desktop.
Convert all the tables in a PDF to Excel files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from ctypes.util import find_library | |
print(find_library("gs")) | |
import camelot | |
# Path of the PDF whose tables are extracted.
path3 = '/Users/paul/Code/2015年中册_small.pdf'
# Output directory prefix; file names are appended directly, so keep the
# trailing slash.
data_path = '/Users/paul/PycharmProjects/pythonProject/pdf2excel/data/'
# Number of pages in the input PDF.
num_total = 345
# Number of pages processed in each batch.
batch_size = 30


def build_page_batches(total_pages, pages_per_batch):
    """Split pages 1..total_pages into consecutive 'first-last' range strings.

    Args:
        total_pages: Number of pages in the document. Zero or a negative
            value yields an empty list.
        pages_per_batch: Maximum number of pages in a single range.

    Returns:
        A list of strings such as ``['1-30', '31-60', ...]``. The final range
        is shortened so that it ends exactly on ``total_pages``; no extra
        (inverted) range is produced when ``total_pages`` is an exact
        multiple of ``pages_per_batch``.

    Raises:
        ValueError: If ``pages_per_batch`` is smaller than 1.
    """
    if pages_per_batch < 1:
        raise ValueError('pages_per_batch must be at least 1')
    return [
        '{}-{}'.format(first, min(first + pages_per_batch - 1, total_pages))
        for first in range(1, total_pages + 1, pages_per_batch)
    ]


# Page ranges to process, one entry per batch.
bath_str_list = build_page_batches(num_total, batch_size)


def main():
    """Extract the tables of every page batch and save each one as .xlsx."""
    for each in bath_str_list:
        print('starting solve pages batch :' + each)
        # Read the pages of the *current* batch; the previous code passed the
        # last range built above, so every iteration re-read the same pages.
        dfs = camelot.read_pdf(path3, pages=each, flavor="stream")
        for j, table in enumerate(dfs):
            print(table)
            # Name the file after the current batch so that batches do not
            # overwrite each other's output.
            table.to_excel(data_path + each + "@" + str(j) + '.xlsx')
        print(each + ' complete!')


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment