"""Exploratory parser for credit-report PDFs.

Opens the PDF at ``pdf_path`` with pdfplumber, extracts every table on every
page and prints each table row.  The header lists and helper functions below
support a table-classification step that is still commented out in the main
loop at the bottom of the file.
"""
import pdfplumber
import pandas as pd
import numpy as np

basePath = "D:/mydocument/myproject/git/busscredit/Crerdai/"
# Report to parse.  To run against another sample report stored under
# basePath, change the file name here.
pdf_path = basePath + "郑晨晨130681199008205811.pdf"

reportTime = None
reportTimeCn = "报告时间:"  # label that precedes the report timestamp
queryreq = {}
identity = {}

# Placeholders for the tables recognised in the report.  They start out as
# empty DataFrame instances so DataFrame attributes work before parsing.
queryreqDf = pd.DataFrame()  # queried-subject information
identityDf = pd.DataFrame()
addressDf = pd.DataFrame()
mateDf = pd.DataFrame()
CREDITSUMMARYCUE_df = pd.DataFrame()
FIRSTLOANOPENMONTH_df = pd.DataFrame()
FIRSTLOANCARDOPENMONTH_df = pd.DataFrame()
FIRSTLOANOPENMONTH = ""  # account age of the first loan
FIRSTLOANCARDOPENMONTH = ""  # account age of the first credit card
OVERDUESUM_df = pd.DataFrame()  # overdue / overdraft summary
SHAREANDDEBT_loan_df = pd.DataFrame()  # non-revolving loan account summary
# NOTE(review): the original comment here repeated the non-revolving-loan
# label; judging by the variable name this holds the credit-card summary.
SHAREANDDEBT_creditCard_df = pd.DataFrame()
RECORDSUMMARY_df = pd.DataFrame()  # query record summary (personal credit report)
RECORDDETAIL_df = pd.DataFrame()  # query record details


def parseQueryreq(reportTime):
    """Store the timestamp part of a report-time string in ``queryreq``.

    Everything after the first ``:`` of *reportTime* is written to
    ``queryreq["reportTime"]``.  Splitting only once keeps the colons of the
    time of day (``HH:MM:SS``) inside the stored value.

    Raises:
        IndexError: if *reportTime* contains no ``:``.
    """
    queryreq["reportTime"] = reportTime.split(":", 1)[1]


# Expected header rows of the tables in the report.  ``None`` entries are kept
# exactly as they appear in the extracted rows these lists are compared with.
headers_queryreq = ["被查询者姓名", "被查询者证件类型", "被查询者证件号码", "查询机构", "查询原因"]
header_identity = ['性别', None, '出生日期', '婚姻状况', '学历', '学位', '就业状况', '国籍', '电子邮箱']
header_address = ['通讯地址', None, None, None, None, '户籍地址', None, None, None]
header_mate = ['姓名', '证件类型', '证件号码', '工作单位', '联系电话']
header_CREDITSUMMARYCUE = ['业务类型', None, '账户数', '首笔业务发放月份']
header_OVERDUESUM = ['账户类型', '账户数', '月份数', '单月最高逾期/透支总额', '最长逾期/透支月数']
header_SHAREANDDEBT_loan = ['管理机构数', '账户数', '授信总额', '余额', '最近6个月平均应还款']
header_SHAREANDDEBT_creditCard = ['发卡机构数', '账户数', '授信总额', '单家机构最高\n授信额', '单家机构最低\n授信额', '已用额度', '最近6个月平\n均使用额度']
header_RECORDSUMMARY = ['贷款审批', '信用卡审批', '贷款审批', '信用卡\n审批', '本人查询', '贷后管理', '担保资格\n审查', '特约商户\n实名审查']
header_RECORDDETAIL = ['查询日期', '查询机构', '查询原因']  # query record details

headers = [0] * 100

ignoreText = ["异议信息提示", "信息主体"]


def isIgnore(text):
    """Return True if *text* contains any of the ``ignoreText`` markers.

    A ``None`` *text* (an empty table cell) is never ignored and yields False.
    """
    if text is None:
        return False
    return any(marker in text for marker in ignoreText)


# Determine the header from the column.
# TODO: the matching condition needs to be revised.
def hasHeader(headers, rows):
    """Return True if the first cell of *rows* equals an entry of *headers*.

    An empty *rows* yields False.
    """
    return hasHeaderIndex(headers, rows, 0)


def hasHeaderIndex(headers, rows, index):
    """Return True if ``rows[index]`` equals an entry of *headers*.

    The comparison is an exact equality test, so a ``None`` cell matches a
    ``None`` entry in *headers*.  An *index* outside *rows* yields False
    instead of raising, so short rows can be probed safely.
    """
    if not -len(rows) <= index < len(rows):
        return False
    cell = rows[index]
    return any(header == cell for header in headers)


def headerOf(headers, entityHeader):
    """Return True if ``str(entityHeader)`` contains any entry of *headers*.

    ``None`` entries in *headers* are skipped, because ``str.find(None)``
    raises ``TypeError`` and several header lists in this file contain them.
    """
    target = str(entityHeader)
    return any(header is not None and header in target for header in headers)


def getMinMonth(FIRSTLOANOPENMONTH_df):
    """Return the smallest month string in *FIRSTLOANOPENMONTH_df*.

    Cells that are not strings, are blank, or hold the ``'--'`` placeholder are
    skipped.  Returns ``""`` when no usable cell remains.

    The minimum is taken by plain string comparison.
    NOTE(review): assumes the month strings are zero-padded (e.g.
    ``YYYY.MM``), so that string order equals chronological order - confirm
    against real reports.
    """
    cells = np.asarray(FIRSTLOANOPENMONTH_df.values, dtype=object).ravel()
    months = [
        cell
        for cell in cells
        if isinstance(cell, str) and cell.strip() not in ("", "--")
    ]
    return min(months) if months else ""


with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # first_page = pdf.pages[1]
        for table in page.extract_tables():
            df = pd.DataFrame(table)
            # Alternative: use the first row as the header.
            # df = pd.DataFrame(table[1:],columns=table[0])
            # print(df)
            for rows in table:
                print(rows)
                # Work-in-progress table classification, currently disabled.
                # NOTE(review): the nesting below is reconstructed; the second
                # header_CREDITSUMMARYCUE branch can never be reached because
                # an identical condition appears earlier in the chain.
                #
                # if col.find(reportTimeCn)>=0:
                #     reportTime = col
                #     parseQueryreq(reportTime)
                #     print(queryreq)
                # if hasHeader(headers_queryreq,rows):
                #     queryreqDf = df
                #     # print(queryreqDf)
                #     break
                # elif hasHeader(header_identity,rows):
                #     identityDf = df[:2]
                #     # print(identityDf)
                #     # addressDf = pd.DataFrame(table,columns=header_address)[['通讯地址','户籍地址']]
                #     addressDf = df.iloc[2:4,[0,5]]
                #     break
                # elif hasHeader(header_mate,rows):
                #     mateDf = df
                #     break
                # elif hasHeader(header_CREDITSUMMARYCUE,rows):
                #     CREDITSUMMARYCUE_df = df
                #     # month of the first loan: third column of all rows
                #     # CREDITSUMMARYCUE_df[~CREDITSUMMARYCUE_df[1:4].iloc[:,[3]].isin(['--'])]
                #     FIRSTLOANOPENMONTH_df=CREDITSUMMARYCUE_df[1:4].iloc[:,[3]]
                #     FIRSTLOANOPENMONTH = getMinMonth(FIRSTLOANOPENMONTH_df)
                #     FIRSTLOANCARDOPENMONTH_df = CREDITSUMMARYCUE_df[4:6].iloc[:,[3]]
                #     FIRSTLOANCARDOPENMONTH = getMinMonth(FIRSTLOANCARDOPENMONTH_df)
                #     print("贷款账龄(月数):"+FIRSTLOANOPENMONTH)
                #     print("信用卡账龄(月数):" + FIRSTLOANCARDOPENMONTH)
                #     # .min
                #     break
                # elif hasHeader(header_CREDITSUMMARYCUE,rows):  # credit transaction summary
                #     CREDITSUMMARYCUE_df = df
                # elif hasHeader(header_SHAREANDDEBT_loan,rows):  # non-revolving loan account summary
                #     SHAREANDDEBT_loan_df = df
                # elif hasHeader(header_SHAREANDDEBT_creditCard,rows):
                #     SHAREANDDEBT_creditCard_df = df
                # elif hasHeader(header_RECORDSUMMARY,rows):
                #     RECORDSUMMARY_df = df
                # elif len(rows)>1 and hasHeaderIndex(header_RECORDDETAIL,rows,1):  # query record details
                #     RECORDDETAIL_df = df
                #     print(RECORDDETAIL_df)