"""Scan a credit-report PDF with pdfplumber and print summary values.

The script opens ``pdf_path``, walks every table on every page and, for the
table whose first row equals ``header_CREDITSUMMARYCUE``, prints the earliest
value found in column 3 (the first-business-issue month).  A separator line
is printed after every table.
"""
import pdfplumber
import pandas as pd
import numpy as np

# Directory holding the report PDFs and the report processed by this run.
# (Commented-out alternative report paths were removed: they were dead code
# and embedded personal identifiers.)
basePath = "D:/mydocument/myproject/git/busscredit/Crerdai/"
pdf_path = basePath + "雷雨晴130630199006130027.pdf"

reportTime = None
reportTimeCn = "报告时间:"
queryreq = {}
identity = {}

# NOTE(review): the names below are bound to the ``pd.DataFrame`` class
# itself, not to instances -- presumably placeholders that are reassigned
# elsewhere; confirm before relying on them.
queryreqDf = pd.DataFrame
identityDf = pd.DataFrame
addressDf = pd.DataFrame
mateDf = pd.DataFrame
CREDITSUMMARYCUE_df = pd.DataFrame
FIRSTLOANOPENMONTH_df = pd.DataFrame
FIRSTLOANCARDOPENMONTH_df = pd.DataFrame
FIRSTLOANOPENMONTH = ""  # account age of the first loan
FIRSTLOANCARDOPENMONTH = ""  # account age of the first credit card
OVERDUESUM_df = pd.DataFrame


def parseQueryreq(reportTime):
    """Store the text after the first ``:`` of *reportTime* in ``queryreq``.

    Args:
        reportTime: A string of the form ``"<label>:<value>"``.

    Raises:
        IndexError: If *reportTime* contains no ``:``.
    """
    # Split only once so a value that itself contains ':' (e.g. a
    # time-of-day "10:20:30") is kept whole instead of being truncated.
    queryreq["reportTime"] = reportTime.split(":", 1)[1]


# Expected first rows of the tables in the report (None marks a blank cell).
headers_queryreq = ["被查询者姓名", "被查询者证件类型", "被查询者证件号码", "查询机构", "查询原因"]
header_identity = ['性别', None, '出生日期', '婚姻状况', '学历', '学位', '就业状况', '国籍', '电子邮箱']
header_address = ['通讯地址', None, None, None, None, '户籍地址', None, None, None]
header_mate = ['姓名', '证件类型', '证件号码', '工作单位', '联系电话']
header_CREDITSUMMARYCUE = ['业务类型', None, '账户数', '首笔业务发放月份']
header_OVERDUESUM = ['账户类型', '账户数', '月份数', '单月最高逾期/透支总额', '最长逾期/透支月数']
header_SHAREANDDEBT = ['管理机构数', '账户数', '授信总额', '余额', '最近6个月平均应还款']
headers = [0] * 100

# Text fragments that mark content to be ignored.
ignoreText = ["异议信息提示", "信息主体"]


def isIgnore(text):
    """Return True if *text* contains any fragment listed in ``ignoreText``.

    ``None`` (a blank cell) is never ignored and yields False instead of
    raising.
    """
    if text is None:
        return False
    return any(fragment in text for fragment in ignoreText)


# Determine the table header from its columns.
# TODO: the matching condition needs to be revised.
def hasHeader(headers, rows):
    """Return True if the first element of *rows* equals any item of *headers*.

    An empty *rows* yields False instead of raising ``IndexError``.
    """
    if len(rows) == 0:
        return False
    first = rows[0]
    return any(header == first for header in headers)


def headerOf(headers, entityHeader):
    """Return True if any string in *headers* occurs in ``str(entityHeader)``.

    Non-string entries of *headers* are skipped: the header lists in this
    file contain ``None`` placeholders (and ``headers`` holds ints), which
    ``str.find`` would reject with ``TypeError``.
    """
    haystack = str(entityHeader)
    return any(
        isinstance(header, str) and header in haystack
        for header in headers
    )


def getMinMonth(FIRSTLOANOPENMONTH_df):
    """Return the smallest month value in *FIRSTLOANOPENMONTH_df*.

    Cells equal to ``'--'`` and missing cells (``None``/NaN) are skipped.
    Values are compared as-is, so month strings order lexicographically.

    Args:
        FIRSTLOANOPENMONTH_df: DataFrame holding the month cells; all of its
            cells are considered.

    Returns:
        The minimum remaining value, or ``""`` when nothing remains.
    """
    # The previous implementation picked np.argmax, i.e. the *latest* month,
    # contradicting the function's name and the .min() used by the main loop.
    months = [
        cell
        for cell in np.ravel(FIRSTLOANOPENMONTH_df.values)
        if not pd.isna(cell) and cell != '--'
    ]
    return min(months) if months else ""


with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        for table in page.extract_tables():
            if not table:
                # Nothing to inspect; indexing row 0 would fail.
                continue
            df = pd.DataFrame(table)
            # Identify the table by comparing its first row with the
            # expected header.
            if df.iloc[0].tolist() == header_CREDITSUMMARYCUE:
                # Drop the header row so its text can never be reported as
                # a month, then take the earliest issue month in column 3.
                dfData = df.iloc[1:]
                print(getMinMonth(dfData[[3]]))
            print("$$$$$$$$$$$$$$$")