Skip to content

Instantly share code, notes, and snippets.

@thulc
Created October 17, 2022 14:33
Show Gist options
  • Save thulc/7b3a8164e5fd974d963867a18be42541 to your computer and use it in GitHub Desktop.
Save thulc/7b3a8164e5fd974d963867a18be42541 to your computer and use it in GitHub Desktop.
Convert all the tables in a PDF to Excel files
from ctypes.util import find_library
print(find_library("gs"))
import camelot
# Path of the PDF to extract tables from.
path3 = '/Users/paul/Code/2015年中册_small.pdf'
# Directory prefix (with trailing slash) where the Excel files are written.
data_path='/Users/paul/PycharmProjects/pythonProject/pdf2excel/data/'
# Number of pages in the input PDF file.
num_total = 345
# Number of pages handled in each batch.
batch_size=30


def build_page_batches(total_pages, pages_per_batch):
    """Split pages 1..total_pages into consecutive 'start-end' range strings.

    Args:
        total_pages: Total number of pages in the document.
        pages_per_batch: Maximum number of pages per range; must be positive.

    Returns:
        A list of strings such as ``['1-30', '31-60', ...]``. The final range
        is shortened to end at ``total_pages``. The list is empty when
        ``total_pages`` is less than 1.

    Raises:
        ValueError: If ``pages_per_batch`` is not positive.
    """
    if pages_per_batch <= 0:
        raise ValueError('pages_per_batch must be a positive integer')
    # Stepping by the batch size yields every range start; clamping the end
    # covers the shorter tail batch, documents that are an exact multiple of
    # the batch size, and documents shorter than a single batch.
    return [
        '{}-{}'.format(first, min(first + pages_per_batch - 1, total_pages))
        for first in range(1, total_pages + 1, pages_per_batch)
    ]


# Page-range strings, one per batch, in document order.
bath_str_list = build_page_batches(num_total, batch_size)


def main():
    """Extract the tables of every page batch and save each one as .xlsx."""
    for each in bath_str_list:
        print('starting solve pages batch :'+each)
        # Read the pages of the *current* batch, not a leftover range string.
        dfs = camelot.read_pdf(path3, pages=each, flavor="stream")
        for j, table in enumerate(dfs):
            print(table)
            # Name the file after its own batch so batches do not overwrite
            # each other's output.
            table.to_excel(data_path + each + "@" + str(j) + '.xlsx')
        print(each + ' complete!')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment