testplumber.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. import pdfplumber
  2. import pandas as pd
  3. import numpy as np;
  4. basePath = "D:/mydocument/myproject/git/busscredit/Crerdai/";
  5. pdf_path = basePath+"闻海雁532329198801060347.pdf"
  6. # pdf_path = basePath+"雷雨晴130630199006130027.pdf"
  7. # pdf_path=basePath+"杨安140402197102111236.pdf"
  8. # pdf_path=basePath+"刘盼兰130133198912261210.pdf"
  9. # pdf_path=basePath+"马维强130521198604045272.pdf"
  10. pdf_path=basePath+"郑晨晨130681199008205811.pdf"
  11. # pdf_path = basePath + "人行征信模拟数据报告.pdf"
  12. reportTime=None;
  13. reportTimeCn="报告时间:"
  14. queryreq = {}
  15. identity = {}
  16. queryreqDf=pd.DataFrame;#被查询信息
  17. identityDf=pd.DataFrame;#
  18. addressDf=pd.DataFrame;
  19. mateDf=pd.DataFrame;
  20. CREDITSUMMARYCUE_df = pd.DataFrame;
  21. FIRSTLOANOPENMONTH_df=pd.DataFrame;
  22. FIRSTLOANCARDOPENMONTH_df=pd.DataFrame;
  23. FIRSTLOANOPENMONTH = "";#首笔贷款账龄
  24. FIRSTLOANCARDOPENMONTH=""#首笔贷记卡账龄
  25. OVERDUESUM_df=pd.DataFrame;#逾期透支信息汇总
  26. SHAREANDDEBT_loan_df=pd.DataFrame;#非循环贷账户信息汇总
  27. SHAREANDDEBT_creditCard_df=pd.DataFrame;#非循环贷账户信息汇总
  28. RECORDSUMMARY_df = pd.DataFrame;#查询记录汇总_个人征信
  29. RECORDDETAIL_df = pd.DataFrame;#查询记录明细
  30. def parseQueryreq(reportTime):
  31. queryreq["reportTime"]=reportTime.split(":")[1];
  32. headers_queryreq = ["被查询者姓名","被查询者证件类型","被查询者证件号码","查询机构","查询原因"];
  33. header_identity=['性别', None, '出生日期', '婚姻状况', '学历', '学位', '就业状况', '国籍', '电子邮箱']
  34. header_address=['通讯地址', None, None, None, None, '户籍地址', None, None, None]
  35. header_mate = ['姓名', '证件类型', '证件号码', '工作单位', '联系电话']
  36. header_CREDITSUMMARYCUE=['业务类型', None, '账户数', '首笔业务发放月份']
  37. header_OVERDUESUM=['账户类型', '账户数', '月份数', '单月最高逾期/透支总额', '最长逾期/透支月数']
  38. header_SHAREANDDEBT_loan=['管理机构数', '账户数', '授信总额', '余额', '最近6个月平均应还款']
  39. header_SHAREANDDEBT_creditCard=['发卡机构数', '账户数', '授信总额', '单家机构最高\n授信额', '单家机构最低\n授信额', '已用额度', '最近6个月平\n均使用额度']
  40. header_RECORDSUMMARY=['贷款审批', '信用卡审批', '贷款审批', '信用卡\n审批', '本人查询', '贷后管理', '担保资格\n审查', '特约商户\n实名审查']
  41. header_RECORDDETAIL=['查询日期', '查询机构', '查询原因'] #查询记录明细
  42. headers = [0] * 100
  43. ignoreText=["异议信息提示","信息主体"]
  44. def isIgnore(text):
  45. for txt in ignoreText:
  46. if text.find(txt)>=0:
  47. return True;
  48. return False;
  49. #根据列确定表头
  50. #判断条件需要修改
  51. def hasHeader(headers,rows):
  52. text = rows[0]
  53. for header in headers:
  54. if header == text:
  55. return True
  56. return False;
  57. def hasHeaderIndex(headers,rows,index):
  58. text = rows[index]
  59. for header in headers:
  60. if header == text:
  61. return True
  62. return False;
  63. def headerOf(headers,entityHeader):
  64. for header in headers:
  65. if str(entityHeader).find(header)>=0:
  66. return True
  67. return False;
  68. def getMinMonth(FIRSTLOANOPENMONTH_df):
  69. FIRSTLOANOPENMONTH_values = [];
  70. FIRSTLOANOPENMONTH = ""
  71. for value in FIRSTLOANOPENMONTH_df.values:
  72. if value == '--':
  73. continue
  74. FIRSTLOANOPENMONTH_values.append(value)
  75. if len(FIRSTLOANOPENMONTH_values)>0:
  76. far = np.array(FIRSTLOANOPENMONTH_values);
  77. findex = np.argmax(far)
  78. FIRSTLOANOPENMONTH = FIRSTLOANOPENMONTH_values[findex][0];
  79. return FIRSTLOANOPENMONTH;
# Open the report selected by pdf_path and print every row of every table
# that pdfplumber extracts, page by page. The table-classification logic
# below the print is currently disabled.
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # first_page = pdf.pages[1]
        for table in page.extract_tables():
            # df is only referenced by the disabled classification code below.
            df = pd.DataFrame(table)
            # Alternative: use the first entry of the table as the header:
            # df = pd.DataFrame(table[1:],columns=table[0])
            # print(df)
            for rows in table:
                print(rows)
                # --- disabled: classify the table by its header row ---
                # if col.find(reportTimeCn)>=0:
                #     reportTime = col;
                #     parseQueryreq(reportTime)
                #     print(queryreq)
                # if hasHeader(headers_queryreq,rows):
                #     queryreqDf = df;
                #     # print(queryreqDf)
                #     break;
                # elif hasHeader(header_identity,rows):
                #     identityDf = df[:2];
                #     # print(identityDf)
                #     # addressDf = pd.DataFrame(table,columns=header_address)[['通讯地址','户籍地址']]
                #     addressDf = df.iloc[2:4,[0,5]]
                #     break;
                # elif hasHeader(header_mate,rows):
                #     mateDf = df;
                #     break;
                # elif hasHeader(header_CREDITSUMMARYCUE,rows):
                #     CREDITSUMMARYCUE_df = df;
                #     # first-loan issue month: third column of all rows
                #     #CREDITSUMMARYCUE_df[~CREDITSUMMARYCUE_df[1:4].iloc[:,[3]].isin(['--'])]
                #     FIRSTLOANOPENMONTH_df=CREDITSUMMARYCUE_df[1:4].iloc[:,[3]]
                #     FIRSTLOANOPENMONTH = getMinMonth(FIRSTLOANOPENMONTH_df)
                #     FIRSTLOANCARDOPENMONTH_df = CREDITSUMMARYCUE_df[4:6].iloc[:,[3]]
                #     FIRSTLOANCARDOPENMONTH = getMinMonth(FIRSTLOANCARDOPENMONTH_df)
                #     print("贷款账龄(月数):"+FIRSTLOANOPENMONTH);
                #     print("信用卡账龄(月数):" + FIRSTLOANCARDOPENMONTH);
                #     #.min
                #     break;
                # elif hasHeader(header_CREDITSUMMARYCUE,rows):  # credit transaction information summary
                #     CREDITSUMMARYCUE_df = df;
                # elif hasHeader(header_SHAREANDDEBT_loan,rows):  # non-revolving loan account summary
                #     SHAREANDDEBT_loan_df = df;
                # elif hasHeader(header_SHAREANDDEBT_creditCard,rows):  # non-revolving loan account summary (label as in the original)
                #     SHAREANDDEBT_creditCard_df = df;
                # elif hasHeader(header_RECORDSUMMARY,rows):  # non-revolving loan account summary (label as in the original)
                #     RECORDSUMMARY_df = df;
                # elif len(rows)>1 and hasHeaderIndex(header_RECORDDETAIL,rows,1):  # query record details
                #     RECORDDETAIL_df = df;
                #     print(RECORDDETAIL_df)