# Script: opens a credit-report PDF with pdfplumber and prints the rows of every extracted table.
- import pdfplumber
- import pandas as pd
- import numpy as np;
# Directory holding the sample report PDFs (machine-specific absolute path).
basePath = "D:/mydocument/myproject/git/busscredit/Crerdai/"

# Report to parse. Alternative sample inputs are kept commented out; enable
# exactly one line to switch files.
# pdf_path = basePath+"闻海雁532329198801060347.pdf"
# pdf_path = basePath+"雷雨晴130630199006130027.pdf"
# pdf_path=basePath+"杨安140402197102111236.pdf"
# pdf_path=basePath+"刘盼兰130133198912261210.pdf"
# pdf_path=basePath+"马维强130521198604045272.pdf"
pdf_path = basePath + "郑晨晨130681199008205811.pdf"
# pdf_path = basePath + "人行征信模拟数据报告.pdf"

reportTime = None
reportTimeCn = "报告时间:"  # marker text (Chinese for "report time:")

# Plain-dict containers; parseQueryreq() writes queryreq["reportTime"].
queryreq = {}
identity = {}

# Result placeholders. They are empty DataFrame *instances* (not the
# DataFrame class) so attributes such as `.empty` behave as expected before
# any table has been assigned.
queryreqDf = pd.DataFrame()  # queried-subject information
identityDf = pd.DataFrame()
addressDf = pd.DataFrame()
mateDf = pd.DataFrame()
CREDITSUMMARYCUE_df = pd.DataFrame()
FIRSTLOANOPENMONTH_df = pd.DataFrame()
FIRSTLOANCARDOPENMONTH_df = pd.DataFrame()
FIRSTLOANOPENMONTH = ""  # age of the first loan account
FIRSTLOANCARDOPENMONTH = ""  # age of the first credit-card account
OVERDUESUM_df = pd.DataFrame()  # overdue / overdraft summary
SHAREANDDEBT_loan_df = pd.DataFrame()  # non-revolving loan account summary
# NOTE(review): the original comment here repeated "non-revolving loan account
# summary"; the name suggests this holds the credit-card summary - confirm.
SHAREANDDEBT_creditCard_df = pd.DataFrame()
RECORDSUMMARY_df = pd.DataFrame()  # inquiry record summary (personal credit report)
RECORDDETAIL_df = pd.DataFrame()  # inquiry record details
def parseQueryreq(reportTime):
    """Store the value part of a ``"<label>:<value>"`` string in ``queryreq``.

    Everything after the first ``:`` of ``reportTime`` is saved under
    ``queryreq["reportTime"]``.

    Args:
        reportTime: text containing a label, a ``:`` and the report time.

    Raises:
        IndexError: if ``reportTime`` contains no ``:``.
    """
    # maxsplit=1 keeps colons that belong to the time itself (e.g. "12:30:45");
    # an unlimited split would cut the value at its first inner colon.
    queryreq["reportTime"] = reportTime.split(":", 1)[1]
# Expected header cells of the tables found in the report. ``None`` entries
# stand for cells that the table extraction returns as empty.

# Query request table.
headers_queryreq = [
    "被查询者姓名",
    "被查询者证件类型",
    "被查询者证件号码",
    "查询机构",
    "查询原因",
]

# Identity table.
header_identity = [
    "性别", None, "出生日期", "婚姻状况", "学历",
    "学位", "就业状况", "国籍", "电子邮箱",
]

# Address row (mailing address / registered address).
header_address = [
    "通讯地址", None, None, None, None,
    "户籍地址", None, None, None,
]

# Spouse table.
header_mate = ["姓名", "证件类型", "证件号码", "工作单位", "联系电话"]

# Credit summary cue table.
header_CREDITSUMMARYCUE = ["业务类型", None, "账户数", "首笔业务发放月份"]

# Overdue / overdraft summary table.
header_OVERDUESUM = [
    "账户类型",
    "账户数",
    "月份数",
    "单月最高逾期/透支总额",
    "最长逾期/透支月数",
]

# Loan summary table.
header_SHAREANDDEBT_loan = [
    "管理机构数",
    "账户数",
    "授信总额",
    "余额",
    "最近6个月平均应还款",
]

# Credit-card summary table; "\n" marks a line break inside a header cell.
header_SHAREANDDEBT_creditCard = [
    "发卡机构数",
    "账户数",
    "授信总额",
    "单家机构最高\n授信额",
    "单家机构最低\n授信额",
    "已用额度",
    "最近6个月平\n均使用额度",
]

# Inquiry record summary table.
header_RECORDSUMMARY = [
    "贷款审批",
    "信用卡审批",
    "贷款审批",
    "信用卡\n审批",
    "本人查询",
    "贷后管理",
    "担保资格\n审查",
    "特约商户\n实名审查",
]

# Inquiry record details table.
header_RECORDDETAIL = ["查询日期", "查询机构", "查询原因"]

# Placeholder list of 100 zeros.
headers = [0] * 100

# Substrings checked by isIgnore().
ignoreText = ["异议信息提示", "信息主体"]
def isIgnore(text):
    """Return True when ``text`` contains any substring listed in ``ignoreText``."""
    return any(text.find(marker) >= 0 for marker in ignoreText)
# Identify a table by comparing a cell of the row with the known header cells.
# TODO: the matching condition needs to be revised (note from the original author).
def hasHeader(headers, rows):
    """Return True if the first cell of ``rows`` equals one of ``headers``.

    Args:
        headers: iterable of candidate header cell values (may contain None).
        rows: one table row, i.e. an indexable sequence of cells.

    Returns:
        True on an exact match of ``rows[0]``; False otherwise, including
        when the row is empty.
    """
    try:
        first_cell = rows[0]
    except IndexError:
        # An empty row cannot be a header row.
        return False
    return any(header == first_cell for header in headers)
def hasHeaderIndex(headers, rows, index):
    """Return True if the cell ``rows[index]`` equals one of ``headers``.

    Args:
        headers: iterable of candidate header cell values (may contain None).
        rows: one table row, i.e. an indexable sequence of cells.
        index: position of the cell to compare; negative values index from
            the end, as usual for sequences.

    Returns:
        True on an exact match; False otherwise, including when ``index`` is
        outside the row (so callers need no separate length check).
    """
    try:
        cell = rows[index]
    except IndexError:
        # Row is too short to have a cell at this position.
        return False
    return any(header == cell for header in headers)
def headerOf(headers, entityHeader):
    """Return True if any string in ``headers`` occurs inside ``entityHeader``.

    Args:
        headers: iterable of candidate header texts. Non-string entries
            (for example the ``None`` placeholders used in the header lists)
            are skipped instead of raising ``TypeError``.
        entityHeader: value to search in; it is converted with ``str()``
            first.

    Returns:
        True on the first substring match, otherwise False.
    """
    # Convert once instead of on every loop iteration.
    text = str(entityHeader)
    return any(
        isinstance(header, str) and text.find(header) >= 0
        for header in headers
    )
def getMinMonth(FIRSTLOANOPENMONTH_df):
    """Return the smallest month value in the frame, ignoring placeholders.

    Cells equal to ``'--'`` and ``None`` cells are skipped. The remaining
    cells are compared with ``min()``; the original selected the *largest*
    value via ``np.argmax``, which contradicts the function name.

    Args:
        FIRSTLOANOPENMONTH_df: object exposing ``.values`` (e.g. a pandas
            DataFrame slice) whose cells hold month values.
            NOTE(review): for string cells the ordering is lexicographic,
            which matches chronological order only if all months share one
            zero-padded format - confirm against real reports.

    Returns:
        The smallest remaining cell value, or ``""`` when nothing is left.
    """
    # Flatten so the function works regardless of the frame's shape.
    cells = np.ravel(FIRSTLOANOPENMONTH_df.values).tolist()
    months = [cell for cell in cells if cell is not None and cell != '--']
    if not months:
        return ""
    return min(months)
# Walk every page of the report, extract its tables and print each row.
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # first_page = pdf.pages[1]
        for table in page.extract_tables():
            # Raw table as a DataFrame; only the disabled dispatch logic
            # below reads it.
            df = pd.DataFrame(table)
            # Alternative: use table[0] as the column header:
            # df = pd.DataFrame(table[1:],columns=table[0])
            # print(df)
            for rows in table:
                print(rows)
                # ---- Disabled table-dispatch logic, kept for reference ----
                # if col.find(reportTimeCn)>=0:
                #     reportTime = col;
                #     parseQueryreq(reportTime)
                #     print(queryreq)
                # if hasHeader(headers_queryreq,rows):
                #     queryreqDf = df;
                #     # print(queryreqDf)
                #     break;
                # elif hasHeader(header_identity,rows):
                #     identityDf = df[:2];
                #     # print(identityDf)
                #     # addressDf = pd.DataFrame(table,columns=header_address)[['通讯地址','户籍地址']]
                #     addressDf = df.iloc[2:4,[0,5]]
                #     break;
                # elif hasHeader(header_mate,rows):
                #     mateDf = df;
                #     break;
                # elif hasHeader(header_CREDITSUMMARYCUE,rows):
                #     CREDITSUMMARYCUE_df = df;
                #     # first-loan issuance month: third column of all rows
                #     #CREDITSUMMARYCUE_df[~CREDITSUMMARYCUE_df[1:4].iloc[:,[3]].isin(['--'])]
                #     FIRSTLOANOPENMONTH_df=CREDITSUMMARYCUE_df[1:4].iloc[:,[3]]
                #     FIRSTLOANOPENMONTH = getMinMonth(FIRSTLOANOPENMONTH_df)
                #     FIRSTLOANCARDOPENMONTH_df = CREDITSUMMARYCUE_df[4:6].iloc[:,[3]]
                #     FIRSTLOANCARDOPENMONTH = getMinMonth(FIRSTLOANCARDOPENMONTH_df)
                #     print("贷款账龄(月数):"+FIRSTLOANOPENMONTH);
                #     print("信用卡账龄(月数):" + FIRSTLOANCARDOPENMONTH);
                #     #.min
                #     break;
                # elif hasHeader(header_CREDITSUMMARYCUE,rows):  # credit transaction summary cue
                #     # NOTE(review): same condition as the branch above, so
                #     # this branch could never run if re-enabled as written.
                #     CREDITSUMMARYCUE_df = df;
                # elif hasHeader(header_SHAREANDDEBT_loan,rows):  # non-revolving loan account summary
                #     SHAREANDDEBT_loan_df = df;
                # elif hasHeader(header_SHAREANDDEBT_creditCard,rows):  # (original comment repeated: non-revolving loan account summary)
                #     SHAREANDDEBT_creditCard_df = df;
                # elif hasHeader(header_RECORDSUMMARY,rows):  # (original comment repeated: non-revolving loan account summary)
                #     RECORDSUMMARY_df = df;
                # elif len(rows)>1 and hasHeaderIndex(header_RECORDDETAIL,rows,1):  # inquiry record details
                #     RECORDDETAIL_df = df;
                #     print(RECORDDETAIL_df)
|