# parsePDF.py

import numpy as np
import pandas as pd
import pdfplumber
  4. basePath = "D:/";
  5. # pdf_path = basePath+"闻海雁532329198801060347(2).pdf"
  6. basePath = "D:/mydocument/myproject/git/busscredit/Crerdai/";
  7. pdf_path = basePath+"雷雨晴130630199006130027.pdf"
  8. # pdf_path=basePath+"杨安140402197102111236.pdf"
  9. # pdf_path=basePath+"刘盼兰130133198912261210.pdf"
  10. # pdf_path=basePath+"马维强130521198604045272.pdf"
  11. # pdf_path=basePath+"郑晨晨130681199008205811.pdf"
  12. reportTime=None;
  13. reportTimeCn="报告时间:"
  14. queryreq = {}
  15. identity = {}
  16. queryreqDf=pd.DataFrame;
  17. identityDf=pd.DataFrame;
  18. addressDf=pd.DataFrame;
  19. mateDf=pd.DataFrame;
  20. CREDITSUMMARYCUE_df = pd.DataFrame
  21. FIRSTLOANOPENMONTH_df=pd.DataFrame
  22. FIRSTLOANCARDOPENMONTH_df=pd.DataFrame;
  23. FIRSTLOANOPENMONTH = "";#首笔贷款账龄
  24. FIRSTLOANCARDOPENMONTH=""#首笔贷记卡账龄
  25. OVERDUESUM_df=pd.DataFrame;
  26. def parseQueryreq(reportTime):
  27. queryreq["reportTime"]=reportTime.split(":")[1];
  28. headers_queryreq = ["被查询者姓名","被查询者证件类型","被查询者证件号码","查询机构","查询原因"];
  29. header_identity=['性别', None, '出生日期', '婚姻状况', '学历', '学位', '就业状况', '国籍', '电子邮箱']
  30. header_address=['通讯地址', None, None, None, None, '户籍地址', None, None, None]
  31. header_mate = ['姓名', '证件类型', '证件号码', '工作单位', '联系电话']
  32. header_CREDITSUMMARYCUE=['业务类型', None, '账户数', '首笔业务发放月份']
  33. header_OVERDUESUM=['账户类型', '账户数', '月份数', '单月最高逾期/透支总额', '最长逾期/透支月数']
  34. header_SHAREANDDEBT=['管理机构数', '账户数', '授信总额', '余额', '最近6个月平均应还款']
  35. headers = [0] * 100
  36. ignoreText=["异议信息提示","信息主体"]
  37. def isIgnore(text):
  38. for txt in ignoreText:
  39. if text.find(txt)>=0:
  40. return True;
  41. return False;
  42. #根据列确定表头
  43. #判断条件需要修改
  44. def hasHeader(headers,rows):
  45. text = rows[0]
  46. for header in headers:
  47. if header == text:
  48. return True
  49. return False;
  50. def headerOf(headers,entityHeader):
  51. for header in headers:
  52. if str(entityHeader).find(header)>=0:
  53. return True
  54. return False;
  55. def getMinMonth(FIRSTLOANOPENMONTH_df):
  56. FIRSTLOANOPENMONTH_values = [];
  57. FIRSTLOANOPENMONTH = ""
  58. for value in FIRSTLOANOPENMONTH_df.values:
  59. if value == '--':
  60. continue
  61. FIRSTLOANOPENMONTH_values.append(value)
  62. if len(FIRSTLOANOPENMONTH_values)>0:
  63. far = np.array(FIRSTLOANOPENMONTH_values);
  64. findex = np.argmax(far)
  65. FIRSTLOANOPENMONTH = FIRSTLOANOPENMONTH_values[findex][0];
  66. return FIRSTLOANOPENMONTH;
  67. pd.Series()
  68. with pdfplumber.open(pdf_path) as pdf:
  69. for page in pdf.pages:
  70. # first_page = pdf.pages[1]
  71. for table in page.extract_tables():
  72. df = pd.DataFrame(table)
  73. # 第一列当成表头:
  74. # df = pd.DataFrame(table[1:],columns=table[0])
  75. tmp=df.loc[0,:].tolist()
  76. tmp2=['业务类型',None,'账户数','首笔业务发放月份']
  77. #判断表头
  78. if tmp==tmp2 :
  79. #要处理的一段数据
  80. dfData=df[df[3]!='--'] #过滤
  81. print(dfData[3].min())
  82. print("$$$$$$$$$$$$$$$")
  83. #@print(df)