
Using pdfplumber in Python to parse and extract PDF documents

Features of pdfplumber

1. It is a pure Python third-party library, suitable for Python 3.x.

2. It is used to view various kinds of PDF information and can effectively extract text and tables.

3. It does not support modifying or generating PDFs, nor does it support processing scanned PDFs.
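Before the full example, here is a minimal sketch of the basic pdfplumber workflow; the file name sample.pdf is just a placeholder, not a file from this article:

import pdfplumber

# Open the document and inspect the first page
with pdfplumber.open('sample.pdf') as pdf:
    first_page = pdf.pages[0]
    print(first_page.page_number)            # 1-based page number
    print(first_page.width, first_page.height)
    text = first_page.extract_text()         # plain text of the page
    tables = first_page.extract_tables()     # list of tables, each a list of rows
    print(text)
    print(tables)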

import glob
import pdfplumber
import re
from collections import defaultdict
import json

class PDFProcessor:
    def __init__(self, filepath):
        self.filepath = filepath
        # Open the document; note where the file is stored
        self.pdf = pdfplumber.open(filepath)
        self.all_text = defaultdict(dict)
        self.allrow = 0
        self.last_num = 0

    def check_lines(self, page, top, buttom):
    	"""
         Used to inspect rows in the page and merge rows according to the given top and bottom positions.
         """
    	# Text data        lines = page.extract_words()[::]
        text = ''
        last_top = 0
        last_check = 0
        for l in range(len(lines)):
            each_line = lines[l]
            # Patterns that typically end a complete line in Chinese annual reports:
            # full stop, semicolon, "单位：元"/"单位：万元" (unit: yuan / 10k yuan), "币种：人民币" (currency: RMB),
            # a digit, or "……报告" ("... report"), possibly followed by a "(revised)"/"(corrected)" suffix
            check_re = '(?:。|；|单位：元|单位：万元|币种：人民币|\d|报告(?:全文)?(?:（修订版）|（修订稿）|（更正后）)?)$'
            if top == '' and buttom == '':
                if abs(last_top - each_line['top']) <= 2:
                    text = text + each_line['text']
                #elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 and not re.search(check_re, text):
                elif last_check > 0 and (page.height * 0.9 - each_line['top']) > 0 and not re.search(check_re, text):
                    text = text + each_line['text']
                else:
                    text = text + '\n' + each_line['text']
            elif top == '':
                if each_line['top'] > buttom:
                    if abs(last_top - each_line['top']) <= 2:
                        text = text + each_line['text']
                    elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 and not re.search(check_re, text):
                        text = text + each_line['text']
                    else:
                        text = text + '\n' + each_line['text']
            else:
                if each_line['top'] < top and each_line['top'] > buttom:
                    if abs(last_top - each_line['top']) <= 2:
                        text = text + each_line['text']
                    elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 and not re.search(check_re, text):
                        text = text + each_line['text']
                    else:
                        text = text + '\n' + each_line['text']
            last_top = each_line['top']
            last_check = each_line['x1'] - page.width * 0.85

        return text

    def drop_empty_cols(self, data):
        # Delete columns whose cells are all empty
        transposed_data = list(map(list, zip(*data)))  # Transpose data
        filtered_data = [col for col in transposed_data if not all(cell == '' for cell in col)]  # Filter out empty columns
        result = list(map(list, zip(*filtered_data)))  # Transpose back
        return result

    @staticmethod
    def keep_visible_lines(obj):
        """
         Keep visible lines.
         If the object is a ``rect`` type, keep it only if the lines are visible.

         A visible line is the one having ``non_stroking_color`` not null.
         """
        if obj['object_type'] == 'rect':
            if obj['non_stroking_color'] is None:
                return False
            if obj['width'] < 1 and obj['height'] < 1:
                return False
            # return obj['width'] >= 1 and obj['height'] >= 1 and obj['non_stroking_color'] is not None
        if obj['object_type'] == 'char':
            return obj['stroking_color'] is not None and obj['non_stroking_color'] is not None
        return True

    def extract_text_and_tables(self, page):
    	"""
         Extract text and tables from the given page.
         """
        buttom = 0
        page = (self.keep_visible_lines)
        tables = page.find_tables()
        if len(tables) >= 1:
            # Table data
            count = len(tables)
            for table in tables:
                if table.bbox[3] < buttom:
                    pass
                else:
                    count -= 1
                    top = table.bbox[1]
                    text = self.check_lines(page, top, buttom)
                    text_list = text.split('\n')
                    for _t in range(len(text_list)):
                        self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                      'type': 'text', 'inside': text_list[_t]}
                        self.allrow += 1

                    buttom = table.bbox[3]
                    new_table = table.extract()
                    r_count = 0
                    for r in range(len(new_table)):
                        row = new_table[r]
                        if row[0] is None:
                            r_count += 1
                            for c in range(len(row)):
                                if row[c] is not None and row[c] not in ['', ' ']:
                                    if new_table[r - r_count][c] is None:
                                        new_table[r - r_count][c] = row[c]
                                    else:
                                        new_table[r - r_count][c] += row[c]
                                    new_table[r][c] = None
                        else:
                            r_count = 0

                    end_table = []
                    for row in new_table:
                        if row[0] != None:
                            cell_list = []
                            cell_check = False
                            for cell in row:
                                if cell != None:
                                    cell = cell.replace('\n', '')
                                else:
                                    cell = ''
                                if cell != '':
                                    cell_check = True
                                cell_list.append(cell)
                            if cell_check == True:
                                end_table.append(cell_list)
                    end_table = self.drop_empty_cols(end_table)

                    for row in end_table:
                        self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                      'type': 'excel', 'inside': str(row)}
                        # self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow, 'type': 'excel',
                        #                               'inside': ' '.join(row)}
                        self.allrow += 1

                    if count == 0:
                        text = self.check_lines(page, '', buttom)
                        text_list = text.split('\n')
                        for _t in range(len(text_list)):
                            self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                          'type': 'text', 'inside': text_list[_t]}
                            self.allrow += 1

        else:
            # Text data (page without tables)
            text = self.check_lines(page, '', '')
            text_list = text.split('\n')
            for _t in range(len(text_list)):
                self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                              'type': 'text', 'inside': text_list[_t]}
                self.allrow += 1
        # Handle headers and footers
        # first_re matches header lines ending in "……报告" ("... annual report"), excluding lines ending in "计报告" (e.g. audit reports);
        # end_re matches footer lines made up of digits, separators and "第/共/页" ("page x of y")
        first_re = '[^计](?:报告(?:全文)?(?:（修订版）|（修订稿）|（更正后）)?)$'
        end_re = '^(?:\d|\\|\/|第|共|页|-|_| ){1,}'
        if self.last_num == 0:
            try:
                first_text = str(self.all_text[1]['inside'])
                end_text = str(self.all_text[len(self.all_text) - 1]['inside'])
                if re.search(first_re, first_text) and '[' not in end_text:
                    self.all_text[1]['type'] = 'Header'
                    if re.search(end_re, end_text) and '[' not in end_text:
                        self.all_text[len(self.all_text) - 1]['type'] = 'footer'
            except:
                print(page.page_number)
        else:
            try:
                first_text = str(self.all_text[self.last_num + 2]['inside'])
                end_text = str(self.all_text[len(self.all_text) - 1]['inside'])
                if re.search(first_re, first_text) and '[' not in end_text:
                    self.all_text[self.last_num + 2]['type'] = 'Header'
                if re.search(end_re, end_text) and '[' not in end_text:
                    self.all_text[len(self.all_text) - 1]['type'] = 'footer'
            except:
                print(page.page_number)

        self.last_num = len(self.all_text) - 1


    def process_pdf(self):
    	"""
         Process the entire PDF document.
         """
        for i in range(len(self.pdf.pages)):
            self.extract_text_and_tables(self.pdf.pages[i])


    def save_all_text(self, path):
    	"""
         Saves all extracted text to the file with the specified path.
         """
        with open(path, 'w', encoding='utf-8') as file:
            for key in self.all_text.keys():
                file.write(json.dumps(self.all_text[key], ensure_ascii=False) + '\n')


def process_all_pdfs_in_folder(folder_path):
	"""
     Process all PDF files in the specified folder.
     """
    file_paths = glob.glob(f'{folder_path}/*')
    file_paths = sorted(file_paths, reverse=True)

    for file_path in file_paths:
        print(file_path)
        try:
            processor = PDFProcessor(file_path)
            processor.process_pdf()
            save_path = 'RAG_ASMPLE_DATAS_TXTS/' + file_path.split('/')[-1].replace('.pdf', '.txt')
            processor.save_all_text(save_path)
        except:
            print('check')

if __name__ == '__main__':
    # Path of the PDF file to be parsed
    pdf_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__Shanghai Aixu New Energy Co., Ltd.__600732__Aixu Shares__2019__Annual Report.pdf'
    # Path of the txt file that will hold the parsed content
    out_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__Shanghai Aixu New Energy Co., Ltd.__600732__Aixu Shares__2019__Annual Report.txt'
    processor = PDFProcessor(pdf_path)
    processor.process_pdf()
    processor.save_all_text(out_path)
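The __main__ block above parses a single report. To batch-process a whole directory with process_all_pdfs_in_folder instead, a minimal sketch follows; the folder name is a placeholder, the function expects forward-slash paths, and it writes its output into an existing RAG_ASMPLE_DATAS_TXTS directory:

if __name__ == '__main__':
    # Hypothetical folder containing the annual-report PDFs
    process_all_pdfs_in_folder('RAG_ASMPLE_DATAS')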

Extract images from a PDF

Extract the images in a PDF and save them locally

import pdfplumber
import os

# Define a function to extract the images in a PDF and save them
def extract_images_from_pdf(pdf_file, output_folder):
    # Create the output folder if it does not exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    with pdfplumber.open(pdf_file) as pdf:
        # Traverse every page
        for page_number, page in enumerate(pdf.pages, start=1):
            print(f'page number: {page.page_number}')
            print(f'page width: {page.width}')
            print(f'page height: {page.height}')

            # Get all images on this page
            images = page.images
            
            # Traverse all images on this page
            for idx, image in enumerate(images, start=1):
                # Get the binary data of the image
                image_data = image['stream'].get_data()

                # Build the image file name
                image_filename = os.path.join(output_folder, f'image_{page_number}_{idx}.png')

                # Save the image to a file
                with open(image_filename, 'wb') as f:
                    f.write(image_data)
                    print(f'The image has been saved to: {image_filename}')

# Example usage
pdf_file = ''
output_folder = 'extracted_images'
extract_images_from_pdf(pdf_file, output_folder)
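Note that image['stream'].get_data() returns the raw decoded stream bytes, which are not always a directly viewable PNG (it depends on the image's encoding filter). An alternative sketch, assuming a recent pdfplumber with its rendering backend installed, crops the page to each image's bounding box and rasterizes that region:

import pdfplumber
import os

def render_images_from_pdf(pdf_file, output_folder):
    # Render each image region of every page to a PNG file
    os.makedirs(output_folder, exist_ok=True)
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            for idx, image in enumerate(page.images, start=1):
                # Bounding box of the image in page coordinates (x0, top, x1, bottom)
                bbox = (image['x0'], image['top'], image['x1'], image['bottom'])
                out_path = os.path.join(output_folder, f'render_{page.page_number}_{idx}.png')
                # to_image() rasterizes the cropped region; resolution is in DPI
                page.crop(bbox).to_image(resolution=150).save(out_path, format='PNG')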

Extract PDF table text

Save as an Excel file

import pdfplumber
from openpyxl import Workbook

# Define a function to extract tables from a PDF and save them as an Excel file
def extract_tables_to_excel(pdf_file, excel_output_file):
    with pdfplumber.open(pdf_file) as pdf:
        workbook = Workbook()
        sheet = workbook.active

        # Traverse every page
        for page in pdf.pages:
            # Extract the table on this page
            table = page.extract_table()
            
            # If a table exists, write it to the Excel worksheet
            if table:
                for row in table:
                    sheet.append(row)

        # Save the Excel file
        workbook.save(excel_output_file)

# Example usage
pdf_file = ''
excel_output_file = ''
extract_tables_to_excel(pdf_file, excel_output_file)
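page.extract_table() returns only one table per page. To keep every detected table, page.extract_tables() (plural) can be used instead; below is a minimal sketch that writes each table to its own worksheet (the p{page}_t{index} sheet-naming scheme is my own choice, not part of the original code):

import pdfplumber
from openpyxl import Workbook

def extract_all_tables_to_excel(pdf_file, excel_output_file):
    workbook = Workbook()
    first_sheet = True
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # extract_tables() returns every detected table, each a list of rows
            for t_idx, table in enumerate(page.extract_tables(), start=1):
                if first_sheet:
                    sheet = workbook.active  # reuse the default sheet for the first table
                    sheet.title = f'p{page.page_number}_t{t_idx}'
                    first_sheet = False
                else:
                    sheet = workbook.create_sheet(title=f'p{page.page_number}_t{t_idx}')
                for row in table:
                    # Replace None cells so openpyxl writes empty strings
                    sheet.append(['' if cell is None else cell for cell in row])
    workbook.save(excel_output_file)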

Save as a text file

import pdfplumber

# Define a function to extract tables from a PDF and save them to a text file
def extract_tables_to_text(pdf_file, text_output_file):
    with pdfplumber.open(pdf_file) as pdf:
        with open(text_output_file, 'w', encoding='utf-8') as output:
            # Traverse every page
            for page in pdf.pages:
                # Extract the table on this page
                table = page.extract_table()
                
                # If a table exists, write it to the text file
                if table:
                    for row in table:
                        output.write('\t'.join(str(cell) for cell in row) + '\n')

# Example usage
pdf_file = ''
text_output_file = ''
extract_tables_to_text(pdf_file, text_output_file)

Extract PDF plain text

import pdfplumber

# Define a function to extract the plain text of a PDF and save it to a text file
def extract_text_to_file(pdf_file, text_output_file):
    with pdfplumber.open(pdf_file) as pdf:
        with open(text_output_file, 'w', encoding='utf-8') as output:
            # Traverse every page
            for page in pdf.pages:
                # Extract the text of this page
                text = page.extract_text()
                
                # If text exists, write it to the text file
                if text:
                    output.write(text)

# Example usage
pdf_file = ''
text_output_file = ''
extract_text_to_file(pdf_file, text_output_file)
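If the text is needed as a single string in memory rather than written to a file, the pages can simply be joined. extract_text() also accepts x_tolerance / y_tolerance parameters that control how close characters must be to merge into the same word or line; 3 is the library default and is only spelled out here to make the knob visible:

import pdfplumber

def pdf_to_string(pdf_file):
    # Collect the text of every page into one string, with a blank line between pages
    pages_text = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text(x_tolerance=3, y_tolerance=3)
            if text:
                pages_text.append(text)
    return '\n\n'.join(pages_text)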

Reading the txt file

Python provides three functions for reading a file: read(), readline(), and readlines().

  • read() reads the whole file at once
  • readline() reads one line at a time
  • readlines() reads all lines and returns them as a list
# Read all text at once
with open('', 'r', encoding='utf-8') as f:
    data = f.read()
    print(data)

# Read the content of the first line
with open('', 'r', encoding='utf-8') as f:
    data = f.readline()
    print(data)

# Read all content line by line, stripping line breaks
with open('', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        line = line.strip('\n')
        print(line)

The above is the detailed content of using pdfplumber in Python to parse and extract PDF documents. For more information about parsing PDFs with pdfplumber in Python, please see my other related articles!