Features of pdfplumber
1. It is a pure-Python third-party library, suitable for Python 3.
2. It is used to view all kinds of PDF information and can effectively extract text and tables (a minimal sketch follows this list).
3. It does not support modifying or generating PDFs, nor does it support scanned PDFs.
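Before the full parsing pipeline below, here is a minimal sketch of the core API; the file name sample.pdf is a placeholder. pdfplumber.open() returns a PDF object whose pages list holds one Page per page, and each Page exposes extract_text() and extract_table().

import pdfplumber

# Minimal sketch of the core API; 'sample.pdf' is a placeholder file name
with pdfplumber.open('sample.pdf') as pdf:
    print(pdf.metadata)                # document-level info (author, dates, ...)
    first_page = pdf.pages[0]
    print(first_page.page_number)      # 1-based page number
    print(first_page.extract_text())   # plain text of the page
    print(first_page.extract_table())  # largest table on the page, as a list of rows (or None)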
import glob
import json
import re
from collections import defaultdict

import pdfplumber


class PDFProcessor:
    def __init__(self, filepath):
        self.filepath = filepath
        # Open the document; mind where the file is stored
        self.pdf = pdfplumber.open(filepath)
        self.all_text = defaultdict(dict)
        self.allrow = 0
        self.last_num = 0

    def check_lines(self, page, top, bottom):
        """
        Inspect the words on the page and merge them into lines,
        restricted to the region between the given top and bottom positions.
        """
        lines = page.extract_words()
        text = ''
        last_top = 0
        last_check = 0
        # Endings that mark a complete line; the phrases target Chinese
        # annual reports and are shown here in translated form
        check_re = (r'(?:。|;|unit:Yuan|unit:万Yuan|Currency:RMB|\d|'
                    r'Report(?:full text)?(?:(Revised version)|(Revised draft)|(After correction))?)$')
        for each_line in lines:
            if top == '' and bottom == '':
                if abs(last_top - each_line['top']) <= 2:
                    text = text + each_line['text']
                elif last_check > 0 and (page.height * 0.9 - each_line['top']) > 0 \
                        and not re.search(check_re, text):
                    text = text + each_line['text']
                else:
                    text = text + '\n' + each_line['text']
            elif top == '':
                if each_line['top'] > bottom:
                    if abs(last_top - each_line['top']) <= 2:
                        text = text + each_line['text']
                    elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 \
                            and not re.search(check_re, text):
                        text = text + each_line['text']
                    else:
                        text = text + '\n' + each_line['text']
            else:
                if top > each_line['top'] > bottom:
                    if abs(last_top - each_line['top']) <= 2:
                        text = text + each_line['text']
                    elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 \
                            and not re.search(check_re, text):
                        text = text + each_line['text']
                    else:
                        text = text + '\n' + each_line['text']
            last_top = each_line['top']
            last_check = each_line['x1'] - page.width * 0.85
        return text

    def drop_empty_cols(self, data):
        """Delete every column in which all cells are empty."""
        transposed_data = list(map(list, zip(*data)))  # transpose rows to columns
        filtered_data = [col for col in transposed_data if not all(cell == '' for cell in col)]  # filter empty columns
        result = list(map(list, zip(*filtered_data)))  # transpose back
        return result

    @staticmethod
    def keep_visible_lines(obj):
        """
        Keep visible objects only. A ``rect`` is kept only if it is actually
        drawn, i.e. its ``non_stroking_color`` is not null and it is at least
        one point wide or tall; a ``char`` is kept only if both colors are set.
        """
        if obj['object_type'] == 'rect':
            if obj['non_stroking_color'] is None:
                return False
            if obj['width'] < 1 and obj['height'] < 1:
                return False
        if obj['object_type'] == 'char':
            return obj['stroking_color'] is not None and obj['non_stroking_color'] is not None
        return True

    def extract_text_and_tables(self, page):
        """Extract text and tables from the given page."""
        bottom = 0
        page = page.filter(self.keep_visible_lines)
        tables = page.find_tables()
        if len(tables) >= 1:
            # Table data
            count = len(tables)
            for table in tables:
                if table.bbox[3] < bottom:
                    pass
                else:
                    count -= 1
                    top = table.bbox[1]
                    # Text that sits above this table
                    text = self.check_lines(page, top, bottom)
                    text_list = text.split('\n')
                    for _t in range(len(text_list)):
                        self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                      'type': 'text', 'inside': text_list[_t]}
                        self.allrow += 1

                    bottom = table.bbox[3]
                    new_table = table.extract()
                    # Merge continuation rows (first cell None) into the row above
                    r_count = 0
                    for r in range(len(new_table)):
                        row = new_table[r]
                        if row[0] is None:
                            r_count += 1
                            for c in range(len(row)):
                                if row[c] is not None and row[c] not in ['', ' ']:
                                    if new_table[r - r_count][c] is None:
                                        new_table[r - r_count][c] = row[c]
                                    else:
                                        new_table[r - r_count][c] += row[c]
                                    new_table[r][c] = None
                        else:
                            r_count = 0

                    end_table = []
                    for row in new_table:
                        if row[0] is not None:
                            cell_list = []
                            cell_check = False
                            for cell in row:
                                if cell is not None:
                                    cell = cell.replace('\n', '')
                                else:
                                    cell = ''
                                if cell != '':
                                    cell_check = True
                                cell_list.append(cell)
                            if cell_check:
                                end_table.append(cell_list)
                    end_table = self.drop_empty_cols(end_table)

                    for row in end_table:
                        self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                      'type': 'excel', 'inside': str(row)}
                        self.allrow += 1

            if count == 0:
                # Text below the last table
                text = self.check_lines(page, '', bottom)
                text_list = text.split('\n')
                for _t in range(len(text_list)):
                    self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                  'type': 'text', 'inside': text_list[_t]}
                    self.allrow += 1
        else:
            # Text data: page with no tables
            text = self.check_lines(page, '', '')
            text_list = text.split('\n')
            for _t in range(len(text_list)):
                self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                              'type': 'text', 'inside': text_list[_t]}
                self.allrow += 1

        # Handle headers and footers (patterns target Chinese annual reports,
        # shown here in translated form)
        first_re = r'[^count](?:Report(?:full text)?(?:(Revised version)|(Revised draft)|(After correction))?)$'
        end_re = r'^(?:\d|\\|\/|The|common|Page|-|_| ){1,}'
        if self.last_num == 0:
            try:
                first_text = str(self.all_text[1]['inside'])
                end_text = str(self.all_text[len(self.all_text) - 1]['inside'])
                if re.search(first_re, first_text) and '[' not in end_text:
                    self.all_text[1]['type'] = 'Header'
                if re.search(end_re, end_text) and '[' not in end_text:
                    self.all_text[len(self.all_text) - 1]['type'] = 'footer'
            except Exception:
                print(page.page_number)
        else:
            try:
                first_text = str(self.all_text[self.last_num + 2]['inside'])
                end_text = str(self.all_text[len(self.all_text) - 1]['inside'])
                if re.search(first_re, first_text) and '[' not in end_text:
                    self.all_text[self.last_num + 2]['type'] = 'Header'
                if re.search(end_re, end_text) and '[' not in end_text:
                    self.all_text[len(self.all_text) - 1]['type'] = 'footer'
            except Exception:
                print(page.page_number)

        self.last_num = len(self.all_text) - 1

    def process_pdf(self):
        """Process the entire PDF document."""
        for i in range(len(self.pdf.pages)):
            self.extract_text_and_tables(self.pdf.pages[i])

    def save_all_text(self, path):
        """Save all extracted text to the file at the specified path."""
        with open(path, 'w', encoding='utf-8') as file:
            for key in self.all_text.keys():
                file.write(json.dumps(self.all_text[key], ensure_ascii=False) + '\n')


def process_all_pdfs_in_folder(folder_path):
    """Process all PDF files in the specified folder."""
    file_paths = glob.glob(f'{folder_path}/*')
    file_paths = sorted(file_paths, reverse=True)
    for file_path in file_paths:
        print(file_path)
        try:
            processor = PDFProcessor(file_path)
            processor.process_pdf()
            save_path = 'RAG_ASMPLE_DATAS_TXTS/' + file_path.split('/')[-1].replace('.pdf', '.txt')
            processor.save_all_text(save_path)
        except Exception:
            print('check')


if __name__ == '__main__':
    # Path of the PDF file to be parsed
    pdf_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__Shanghai Aixu New Energy Co., Ltd.__600732__Aixu Shares__2019__Annual Report.pdf'
    # Path of the .txt file that receives the parsed content
    out_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__Shanghai Aixu New Energy Co., Ltd.__600732__Aixu Shares__2019__Annual Report.txt'
    processor = PDFProcessor(pdf_path)
    processor.process_pdf()
    processor.save_all_text(out_path)
Extract pictures from PDF
Extract the images in a PDF and save them locally
import os

import pdfplumber


# Define a function that extracts the images in a PDF and saves them
def extract_images_from_pdf(pdf_file, output_folder):
    # Create the output folder if it does not exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    with pdfplumber.open(pdf_file) as pdf:
        # Iterate over every page
        for page_number, page in enumerate(pdf.pages, start=1):
            print(f'page number: {page.page_number}')
            print(f'page width: {page.width}')
            print(f'page height: {page.height}')
            # Get all images on this page
            images = page.images
            # Iterate over the images on this page
            for idx, image in enumerate(images, start=1):
                # Get the raw binary data of the image stream
                image_data = image['stream'].get_data()
                # Build the image file name
                image_filename = os.path.join(output_folder, f'image_{page_number}_{idx}.png')
                # Save the image to a file
                with open(image_filename, 'wb') as f:
                    f.write(image_data)
                print(f'Image saved to: {image_filename}')


# Example usage
pdf_file = ''
output_folder = 'extracted_images'
extract_images_from_pdf(pdf_file, output_folder)
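Note that the raw stream bytes returned by get_data() are only directly usable as an image file when the PDF stored the picture in a self-contained format such as JPEG; for other filters the bytes may need decoding first. A simpler, if lossier, alternative is to rasterize each image's bounding box with pdfplumber's own renderer. A sketch, assuming the rendering dependencies used by to_image() are installed; the function name render_images_from_pdf is illustrative:

import os

import pdfplumber


# Sketch: rasterize each image's bounding box instead of copying raw stream bytes
def render_images_from_pdf(pdf_file, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    with pdfplumber.open(pdf_file) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            for idx, image in enumerate(page.images, start=1):
                # Crop the page to the image's bounding box and render it;
                # assumes the bbox lies within the page boundaries
                bbox = (image['x0'], image['top'], image['x1'], image['bottom'])
                out_path = os.path.join(output_folder, f'image_{page_number}_{idx}.png')
                page.crop(bbox).to_image(resolution=150).save(out_path)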
Extract PDF table text
Save as an Excel file
import pdfplumber
from openpyxl import Workbook


# Define a function that extracts the tables in a PDF and saves them to an Excel file
def extract_tables_to_excel(pdf_file, excel_output_file):
    with pdfplumber.open(pdf_file) as pdf:
        workbook = Workbook()
        sheet = workbook.active
        # Iterate over every page
        for page in pdf.pages:
            # Extract the (largest) table on this page
            table = page.extract_table()
            # If a table exists, write it to the Excel sheet
            if table:
                for row in table:
                    sheet.append(row)
        # Save the Excel file
        workbook.save(excel_output_file)


# Example usage
pdf_file = ''
excel_output_file = ''
extract_tables_to_excel(pdf_file, excel_output_file)
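extract_table() returns only one table per page (the largest, under the default settings), so pages holding several tables lose data; extract_tables() returns them all. A sketch of a variant that keeps every table and starts a new worksheet per page; the function name and sheet-naming scheme are illustrative:

import pdfplumber
from openpyxl import Workbook


# Sketch: one worksheet per page, keeping every table found on that page
def extract_all_tables_to_excel(pdf_file, excel_output_file):
    workbook = Workbook()
    workbook.remove(workbook.active)  # drop the default empty sheet
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()  # all tables, not just the largest
            if tables:
                sheet = workbook.create_sheet(title=f'page_{page.page_number}')
                for table in tables:
                    for row in table:
                        sheet.append(row)
                    sheet.append([''])  # blank separator row between tables
    workbook.save(excel_output_file)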
Save as a text file
import pdfplumber


# Define a function that extracts the tables in a PDF and saves them to a text file
def extract_tables_to_text(pdf_file, text_output_file):
    with pdfplumber.open(pdf_file) as pdf:
        with open(text_output_file, 'w', encoding='utf-8') as output:
            # Iterate over every page
            for page in pdf.pages:
                # Extract the (largest) table on this page
                table = page.extract_table()
                # If a table exists, write it tab-separated to the text file
                if table:
                    for row in table:
                        output.write('\t'.join(str(cell) for cell in row) + '\n')


# Example usage
pdf_file = ''
text_output_file = ''
extract_tables_to_text(pdf_file, text_output_file)
Extract PDF plain text
import pdfplumber


# Define a function that extracts the plain text of a PDF and saves it to a text file
def extract_text_to_file(pdf_file, text_output_file):
    with pdfplumber.open(pdf_file) as pdf:
        with open(text_output_file, 'w', encoding='utf-8') as output:
            # Iterate over every page
            for page in pdf.pages:
                # Extract the text of this page
                text = page.extract_text()
                # If text exists, write it to the text file
                if text:
                    output.write(text + '\n')  # newline keeps pages apart


# Example usage
pdf_file = ''
text_output_file = ''
extract_text_to_file(pdf_file, text_output_file)
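extract_text() also accepts tuning parameters: x_tolerance and y_tolerance control how aggressively characters are grouped into words and lines, and layout=True (available in recent pdfplumber versions) tries to preserve the page's visual layout, which helps with multi-column pages. A minimal sketch; 'sample.pdf' is a placeholder:

import pdfplumber

# Sketch: tuning extract_text()
with pdfplumber.open('sample.pdf') as pdf:
    page = pdf.pages[0]
    # Preserve the page's visual layout (useful for multi-column pages)
    print(page.extract_text(layout=True))
    # Tighter tolerances split words and lines more aggressively
    print(page.extract_text(x_tolerance=1, y_tolerance=1))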
Read a text (txt) file
Python file objects offer three reading methods: read(), readline(), and readlines().
- read() reads the entire file at once
- readline() reads a single line at a time
- readlines() reads the whole file and returns its lines as a list
# Read all text at once
with open('', 'r', encoding='utf-8') as f:
    data = f.read()
    print(data)

# Read the content of the first line
with open('', 'r', encoding='utf-8') as f:
    data = f.readline()
    print(data)

# Read all lines, then process them one by one, removing the line breaks
with open('', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        line = line.strip('\n')
        print(line)
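Note that read() and readlines() load the whole file into memory. For large files, such as the .txt output produced by the PDFProcessor above, iterating over the file object directly reads lazily, one line at a time. A sketch; 'data.txt' is a placeholder:

# Sketch: memory-friendly line iteration
with open('data.txt', 'r', encoding='utf-8') as f:
    for line in f:                 # reads one line at a time
        print(line.rstrip('\n'))   # strip the trailing newline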
The above is the detailed content of using pdfplumber in Python for PDF document analysis and extraction. For more information about parsing PDFs with pdfplumber in Python, please follow my other related articles!