123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293 |
- import pdfplumber
- import pandas as pd
- import numpy as np;
# Location of the credit-report PDF that this script parses.
# NOTE(review): the path is hard-coded to one workstation and the file name
# embeds personal data; consider reading it from configuration or argv.
basePath = "D:/mydocument/myproject/git/busscredit/Crerdai/"
pdf_path = basePath + "雷雨晴130630199006130027.pdf"

# Report timestamp (not yet known) and the label that precedes it in the text.
reportTime = None
reportTimeCn = "报告时间:"

# Accumulators for the parsed "query request" and "identity" sections.
queryreq = {}
identity = {}

# Placeholders for the per-section tables.
# NOTE(review): these names are bound to the DataFrame *class*, not to an
# empty DataFrame instance -- confirm that is what later code expects.
queryreqDf = identityDf = addressDf = mateDf = pd.DataFrame
CREDITSUMMARYCUE_df = pd.DataFrame
FIRSTLOANOPENMONTH_df = pd.DataFrame
FIRSTLOANCARDOPENMONTH_df = pd.DataFrame
OVERDUESUM_df = pd.DataFrame

FIRSTLOANOPENMONTH = ""  # account age of the first loan
FIRSTLOANCARDOPENMONTH = ""  # account age of the first credit card
def parseQueryreq(reportTime):
    """Record the report time from a labelled string in ``queryreq``.

    Args:
        reportTime: Text of the form ``"<label>:<value>"``. Everything after
            the first ``":"`` is stored under ``queryreq["reportTime"]``.

    Raises:
        IndexError: If ``reportTime`` contains no ``":"`` separator.
    """
    # Split on the first colon only, so a value that itself contains colons
    # (e.g. a time of day such as 12:30:45) is kept whole instead of being
    # cut off at its first colon.
    queryreq["reportTime"] = reportTime.split(":", 1)[1]
# Expected header rows of the report's tables, one list per section.
# A ``None`` entry stands for a header cell that carries no text.

# Query request section.
headers_queryreq = [
    "被查询者姓名",
    "被查询者证件类型",
    "被查询者证件号码",
    "查询机构",
    "查询原因",
]

# Identity section.
header_identity = [
    "性别",
    None,
    "出生日期",
    "婚姻状况",
    "学历",
    "学位",
    "就业状况",
    "国籍",
    "电子邮箱",
]

# Address section.
header_address = [
    "通讯地址",
    None,
    None,
    None,
    None,
    "户籍地址",
    None,
    None,
    None,
]

# Spouse section.
header_mate = [
    "姓名",
    "证件类型",
    "证件号码",
    "工作单位",
    "联系电话",
]

# Credit summary section.
header_CREDITSUMMARYCUE = [
    "业务类型",
    None,
    "账户数",
    "首笔业务发放月份",
]

# Overdue summary section.
header_OVERDUESUM = [
    "账户类型",
    "账户数",
    "月份数",
    "单月最高逾期/透支总额",
    "最长逾期/透支月数",
]

# Shared/outstanding debt section.
header_SHAREANDDEBT = [
    "管理机构数",
    "账户数",
    "授信总额",
    "余额",
    "最近6个月平均应还款",
]

# Fixed-size list of 100 zeros; its use is not visible in this file section.
headers = [0 for _ in range(100)]
# Marker phrases; any text containing one of them is skipped by the parser.
ignoreText = ["异议信息提示", "信息主体"]


def isIgnore(text):
    """Return True when ``text`` contains any phrase listed in ``ignoreText``.

    Args:
        text: Text to inspect. ``None`` or an empty string is treated as
            "nothing to ignore" and yields False instead of raising.

    Returns:
        bool: True if at least one marker phrase occurs in ``text``.
    """
    if not text:
        # Nothing to search in; avoids AttributeError on a None value.
        return False
    return any(marker in text for marker in ignoreText)
# Identify a table header from the first cell of a row.
# TODO (from the original author): the matching condition needs to be revised.
def hasHeader(headers, rows):
    """Return True when the first cell of ``rows`` equals one of ``headers``.

    Args:
        headers: Iterable of candidate header labels.
        rows: Sequence of cells; only ``rows[0]`` is compared.

    Returns:
        bool: True if ``rows[0]`` equals any entry of ``headers``; False
        otherwise, including when ``rows`` is empty.
    """
    if len(rows) == 0:
        # An empty row has no first cell to compare, so it matches nothing.
        return False
    first_cell = rows[0]
    return any(header == first_cell for header in headers)
def headerOf(headers, entityHeader):
    """Return True when any label in ``headers`` occurs inside ``entityHeader``.

    Args:
        headers: Iterable of header labels. Entries that are not strings
            (for example ``None`` placeholders or numeric fillers) are
            skipped, because they cannot be searched for as substrings.
        entityHeader: Value to search in; it is converted with ``str()``.

    Returns:
        bool: True if at least one string label is a substring of
        ``str(entityHeader)``.
    """
    haystack = str(entityHeader)  # convert once instead of once per label
    return any(
        isinstance(label, str) and label in haystack
        for label in headers
    )
def getMinMonth(FIRSTLOANOPENMONTH_df):
    """Return the smallest month value found in the first column.

    Rows whose first cell is missing or equals the ``'--'`` placeholder are
    ignored. The remaining cells are compared with ``min()``; for strings
    that is lexicographic order.
    NOTE(review): lexicographic order equals chronological order only for
    zero-padded values such as ``"2009.11"`` -- confirm the report format.

    Args:
        FIRSTLOANOPENMONTH_df: DataFrame whose first column holds the month
            values (presumably a single-column frame; verify against caller).

    Returns:
        The smallest remaining value, or ``""`` when none is left.
    """
    months = []
    for row in FIRSTLOANOPENMONTH_df.values:
        cell = row[0]
        if pd.isna(cell) or cell == '--':
            continue
        months.append(cell)
    # The function is named "min" and the script itself uses ``.min()`` on
    # the same column, so return the minimum rather than the maximum.
    return min(months) if months else ""
def extractFirstBusinessMonth(table):
    """Return the smallest "首笔业务发放月份" value of a credit-summary table.

    Args:
        table: A table as a list of rows, each row a list of cell values
            (strings or ``None``).

    Returns:
        The smallest value in the fourth column of the data rows, or
        ``None`` when the table is empty, its first row is not the
        credit-summary header, or no usable value is present.
    """
    summary_header = ['业务类型', None, '账户数', '首笔业务发放月份']
    if not table or list(table[0]) != summary_header:
        return None
    # Skip the header row itself, plus empty cells and the '--' placeholder,
    # so that only real values take part in the comparison.
    months = [
        row[3]
        for row in table[1:]
        if len(row) > 3 and row[3] and row[3] != '--'
    ]
    return min(months) if months else None


def main():
    """Print the first-business month of every credit-summary table in the PDF."""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                month = extractFirstBusinessMonth(table)
                if month is not None:
                    print(month)
                # NOTE(review): the original indentation was lost, so the
                # nesting level of this separator is an assumption; it is
                # printed once per table here.
                print("$$$$$$$$$$$$$$$")


if __name__ == "__main__":
    main()
|