SoFunction
Updated on 2025-05-18

Python implementation to obtain table data with merged cells

Since some tables of merged cells often appear in daily operations and maintenance, if it is troublesome to obtain data, it will now be encapsulated into a class and obtain the data in the form of a list by calling list_excel_data(). dict_excel_data(): Get data in dictionary format.

When obtaining data in dictionary form, be careful that the first line is used as the key of the dictionary by default.

The code is as follows:

from openpyxl import load_workbook
 
 
class Get_table_data():
    """
     Processing table data with merged cells
     """
    def __init__(self,sh):
        """
         Define the initial incoming table
         :param sh: table
         """
         = sh
 
    def get_row_col(self):
        """
         # Get row and column information of the table :param sh:
         :return:
         """
        # title = # Get sheet name        max_row_num = .max_row  # Get the maximum number of rows        max_col_num = .max_column  # Get the maximum number of columns        # min_row_num=sh.min_row # Get the minimum number of rows        # min_col_num = sh.min_column # Get the minimum number of columns        return max_row_num,max_col_num
 
    # Get the coordinate information of the merged cell and the number of rows and columns of the merged cell    def get_merge_data(self):
        """
         Convert the merged cells to a specific format by obtaining them
         :return: Index information of merged cells
         """
        # Query all merged cells in the sheet form        merge_lists = .merged_cells
        # print('merge_lists',merge_lists)
        merge_all_list = []  # Receive the final content and return        # traverse merged cells        for merge_list in merge_lists:
            # Get the start row (row_min) and terminate row (row_max) and start column (col_min) of a single merged cell            row_min, row_max, col_min, col_max = merge_list.min_row, merge_list.max_row, merge_list.min_col, merge_list.max_col
            # Here we judge that if the rows and columns of the start and end of the merged cell are not equal, it means that the merged cell has merged both rows and columns. The two for loops are successively taken out rows and columns in x and y respectively.            if row_min != row_max and col_min != col_max:
                row_col = [(x, y) for x in range(row_min, row_max + 1) for y in range(col_min, col_max + 1)]
                merge_all_list.append(row_col)  # The retrieved value is in the list            # Here we judge that if the start and end rows of the merged cells are equal, and the start and end columns are not equal, it means that the merged cells only merge columns, so the row does not move, and only the column values ​​are looped out. If it exists in y, the row can be taken at will at will.            elif row_min == row_max and col_min != col_max:
                row_col = [(row_min, y) for y in range(col_min, col_max + 1)]
                merge_all_list.append(row_col)  # The retrieved value is in the list            # Here we judge that if the start and end rows of the merged cells are not equal, and the start and end columns are equal, it means that the merged cells only merge rows, so the column does not move, and only the row values ​​are looped out. It exists in x, and the column can be randomly taken col_min/col_max            elif row_min != row_max and col_min == col_max:
                row_col = [(x, col_min) for x in range(row_min, row_max + 1)]
                merge_all_list.append(row_col)  # The retrieved value is in the list        return merge_all_list  # Finally return to the list        # The result is a list value like this: [[(2, 1), (3, 1)], [(10, 1), (10, 2), (10, 3), (11, 1), (11, 2), (11, 3)]] 
    def merge_values(self,merge_cell):  # Pass a tuple to enter parameters        """
         Process merged cells and return the merged cell value
         :param merge_cell: merge cell information in the form of a list of embedded binary groups
         :return: Return the cell value
         """
        # Loop out the value obtained by the merged cell method (this value is still a list) and check whether the passed parameters are in these values.        for i in range(0, len(merge_cell)):
            # Get the value of the merged cell: the value of the first row and column coordinates in the upper left corner of the merged cell            cell_value = (row=merge_cell[i][0][0], column=merge_cell[i][0][1]).value
            return cell_value
 
    def list_excel_data(self):
        """
         Get all rows of data in the table in the list format
         :return: Format nested by rows
         """
        merge_list = self.get_merge_data()  # Get information about the cells merged in the table        merge_list_all = sum(merge_list,[])     # Convert merged cells into a large list        table_value = []
        for row in range(1,.max_row + 1):
            row_value = []  # Define an empty list of rows of data that holds data            for col in range(1,.max_column + 1):
                cell_data = (row, col)
                if cell_data in merge_list_all:
                    row_value.append(self.merge_values(merge_list))      # is a merge cell, then call the merge cell value to get the function                else:   # Not available, it means that it is not a merged cell, just use the normal cell method to obtain it                    row_value.append((*cell_data).value)
            table_value.append(row_value)
        return table_value
 
    def dict_excel_data(self):
        """
         Display data in the table in dictionary format
         :return: format of nested lists by line
         """
        merge_list = self.get_merge_data()  # Get information about the cells merged in the table        merge_list_all = sum(merge_list, [])  # Convert merged cells into a large list        list_val = []
        for row in range(1, .max_row + 1):
            if row > 1:  # The second line begins                dict_val = {}  # Define an empty dictionary to store data                for col in range(1, .max_column + 1):
                    title_row = (1, col)     # The first line of the table: Title bar                    cell_data = (row, col)
                    if cell_data in merge_list_all:
                        # is a merge cell, then call the merge cell value to get the function                        dict_val[self.merge_values(merge_list)] = self.merge_values(merge_list)
                    else:  # Not available, it means that it is not a merged cell, just use the normal cell method to obtain it                        dict_val[(*title_row).value] = (*cell_data).value
                list_val.append(dict_val)
        return list_val
 
# Read the excel tablewb = load_workbook('')
# Get the specified sheetsheet_sb = wb['sheet']
 
c = Get_table_data(sheet_sb)        # Create a table data object 
print(c.dict_excel_data())     # Dictionary format# print(c.list_excel_data())     # List format

Knowledge extension

Python uses xlrd to implement reading merged cells

How to operate:

1. Use xlrd's own attributes: merged_cells

# Get all merged cells in the table and return in a list form (start row, end row, start column, end column)merged = sheet.merged_cells #result:[(1,5,0,1),(5,9,0,1)]

2. Use a loop to determine whether to merge cells or normal cells, and assign the first row value in the merged cell to the merged cell

def get_cell_type(row_index, col_index):
"""You can get both merged cells and ordinary cells"""
cell_value = None
for (rlow, rhigh, clow, chigh) in merged: # traverse all merged cells position information in the table# print(rlow,rhigh,clow,chigh)
if (row_index >= rlow and row_index < rhigh): # Row coordinate judgmentif (col_index >= clow and col_index < chigh): # Column coordinate judgment# If the condition is met, assign the value of the first position of the merged cell to other merged cellscell_value = sheet.cell_value(rlow, clow)
print('Merge cells')
break # Bounce out of the loop if it does not meet the conditions to prevent overwriteelse:
print('Normal Cell')
cell_value = sheet.cell_value(row_index, col_index)
# else: Add only one cell after changing the line content 5, 0 will return 2 values ​​Normal cells/merge cells# print('Normal Cell')# cell_value = sheet.cell_value(row_index, col_index)
return cell_value
# Enter the coordinates of the cell directly.  To get cell content# print(get_cell_type(5, 0))
# Use loop to output the cell content of a columnfor i in range(1, 9):
print(get_cell_type(i, 2))

PS: The easiest operation to read merge cells in Excel files

question:

1. When outputting content, use coordinates to get print. If there is else on the outermost layer, 2 values ​​will be returned (and confirm whether there will be other problems if there is no outermost layer else)

2. It is normal when used for the first time. When used again, sheet.merged_cells returns the list as empty? ?

Solution: Add formatting_info=True to the open file and it will display normally

python reads excel and processes data of merged cells

The following code is only an example, and it is optimized as appropriate.

from openpyxl import load_workbook
from  import MergedCell
import time
 
import pandas as pd
 
 
def excel_to_md(file_path, output_file_path):
    # Replace \r\n in the original data    def replace_value(str):
        return ('\n', '').replace('\r', '') if str else ''
    start = ()
    # Use openpyxl's load_workbook to read excel cell properties and merge cell data groups    merged_xls = (load_workbook(file_path), engine="openpyxl")
    # Use pandas' read_excel to read data    with (file_path, engine="openpyxl") as xls:
        with open(output_file_path, 'w') as f:
            for sheet_name in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet_name,engine="openpyxl")
                (axis=1, how='all', inplace=True)
                (axis=0, how='all', inplace=True)
                # print(f'sheet_name: {sheet_name}, start getting merged cell collection ')                # cells_time = ()
                sheet = merged_xls.book[sheet_name]
                merged_cells = sheet.merged_cells
                # print(f'sheet_name: {sheet_name}, there are merged cells {len(merged_cells.ranges)if merged_cells else 0}')                for item in merged_cells:
                    top_col, top_row, bottom_col, bottom_row = 
                    base_value = replace_value(item.start_cell.value)
                    # 1-based index to 0-based index                    top_row -= 1
                    top_col -= 1
                    [top_row:bottom_row, top_col:bottom_col] = base_value
                # print(f'sheet_name: {sheet_name}, assign value to the merged cells to complete %.5f sec' %(()-cells_time))                # Assign an empty cell to a null character                df = ('')
                # Start by writing                (f'# {sheet_name}\n')
                for index, row in ():
                    # Handle empty table header read as Unnamed: 0 Replace with Unnamed-0                    row_str = ';'.join([f'{str(col).replace(": ","-") if "Unnamed:" in str(col) else col}:{row[col]}' for col in ])
                    replace_value(row_str)
                    (f'{row_str}\n')
    merged_xls.close()
    print(file_path+'Execution time: %.5f sec' %(()-start))
 
excel_to_md('','')

This is the article about Python implementation to obtain table data with merged cells. For more related Python content to obtain table data, please search for my previous articles or continue browsing the related articles below. I hope everyone will support me in the future!