SoFunction
Updated on 2024-11-17

Example: generating an LMDB-format dataset file with Python

CRNN training requires the dataset to be in LMDB format. The Python code below generates such an LMDB dataset. Note that it should be run on a Linux system; otherwise problems occur when the images are read in. The problems you may encounter are described in the comments inside the code, so reading through the code is enough.

#-*- coding:utf-8 -*-
 
import os
import lmdb#pip install this module first
import cv2
import glob
import numpy as np
 
 
def checkImageIsValid(imageBin):
    """Return True if ``imageBin`` decodes to a non-empty grayscale image.

    ARGS:
      imageBin : raw encoded image bytes (e.g. the content of a .jpg file), or None

    Returns False for None, for empty input, for data that OpenCV cannot
    decode, and for decoded images whose height or width is zero.
    """
    # Reject missing or empty input up front so the decoder is never
    # handed an empty buffer.
    if imageBin is None or len(imageBin) == 0:
        return False
    # View the raw bytes as a flat uint8 array, which is the input form
    # cv2.imdecode expects.
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return False
    imgH, imgW = img.shape[0], img.shape[1]
    return imgH * imgW != 0
 
def writeCache(env, cache):
    """Write every key/value pair of ``cache`` to LMDB in a single write transaction.

    ARGS:
      env   : LMDB environment object providing ``begin(write=True)``
      cache : dict of key -> value; text items are encoded as UTF-8,
              byte strings are stored unchanged
    """
    def _toBytes(item):
        # LMDB stores raw bytes only; encode text so the same code works
        # whether the caller supplies byte strings or unicode text.
        return item if isinstance(item, bytes) else item.encode('utf-8')

    with env.begin(write=True) as txn:
        for k, v in cache.items():
            txn.put(_toBytes(k), _toBytes(v))
 
def createDataset(outputPath, imagePathList, labelList, lexiconList=None, checkValid=True, mapSize=8589934592):
    """
    Create LMDB dataset for CRNN training.

    ARGS:
      outputPath    : LMDB output path
      imagePathList : list of image paths
      labelList     : list of corresponding groundtruth texts
      lexiconList   : (optional) list of lexicon lists, indexed like imagePathList
      checkValid    : if true, check the validity of every image
      mapSize       : value passed to lmdb.open as map_size, in bytes.
                      Defaults to 8 GiB; the original value of 1 TiB
                      (1099511627776) was reduced because it produced an
                      "insufficient disk space" error.

    Missing files and (when checkValid is true) undecodable images are
    reported and skipped. Entries are stored under the keys
    'image-%09d', 'label-%09d' and optionally 'lexicon-%09d', numbered
    from 1 over the samples actually written; 'num-samples' holds the
    final count as a decimal string.
    """
    assert len(imagePathList) == len(labelList), \
        'imagePathList and labelList must have the same length'
    nSamples = len(imagePathList)
    print('...................')
    env = lmdb.open(outputPath, map_size=mapSize)

    try:
        cache = {}
        cnt = 1
        for i, (imagePath, label) in enumerate(zip(imagePathList, labelList)):
            if not os.path.exists(imagePath):
                print('%s does not exist' % imagePath)
                continue
            # Images must be read in binary mode: text mode translates
            # bytes on Windows and corrupts the data, which makes every
            # image fail the validity check below.
            with open(imagePath, 'rb') as f:
                imageBin = f.read()
            if checkValid and not checkImageIsValid(imageBin):
                print('%s is not a valid image' % imagePath)
                continue

            imageKey = 'image-%09d' % cnt
            labelKey = 'label-%09d' % cnt
            cache[imageKey] = imageBin
            cache[labelKey] = label
            if lexiconList:
                lexiconKey = 'lexicon-%09d' % cnt
                cache[lexiconKey] = ' '.join(lexiconList[i])
            # Flush in batches of 1000 samples to bound memory use.
            if cnt % 1000 == 0:
                writeCache(env, cache)
                cache = {}
                print('Written %d / %d' % (cnt, nSamples))
            cnt += 1
        # cnt is one past the last sample written; skipped inputs are not counted.
        nSamples = cnt - 1
        cache['num-samples'] = str(nSamples)
        writeCache(env, cache)
    finally:
        # Always release the environment, even if reading or writing failed.
        env.close()
    print('Created dataset with %d samples' % nSamples)
 
 
def read_text(path):
    """Return the content of the text file at ``path`` with leading and trailing whitespace removed.

    ARGS:
      path : path of the label (.txt) file to read
    """
    with open(path) as f:
        text = f.read()
    return text.strip()
 
 
if __name__ == '__main__':
    # LMDB output directory. The training set and the validation set are
    # generated separately, so this program has to be run twice, once
    # for each of them.
    outputPath = 'D:/ruanjianxiazai/tuxiangyangben/fengehou/train'

    # The .jpg images and their .txt label files all live in the same directory.
    path = "D:/ruanjianxiazai/tuxiangyangben/fengehou/chenguang/*.jpg"
    imagePathList = glob.glob(path)
    print('------------ %d ------------' % len(imagePathList))
    imgLabelLists = []
    for p in imagePathList:
        # The label file shares the image's base name; swap only the
        # extension so a '.jpg' elsewhere in the path is left untouched.
        labelPath = os.path.splitext(p)[0] + '.txt'
        try:
            imgLabelLists.append((p, read_text(labelPath)))
        except (IOError, OSError, UnicodeError) as err:
            # Images without a readable label file are skipped, but reported.
            print('skipping %s: %s' % (p, err))
            continue

    # Sort the samples by label length, shortest first.
    imgLabelList = sorted(imgLabelLists, key=lambda x: len(x[1]))
    imgPaths = [pair[0] for pair in imgLabelList]
    txtLists = [pair[1] for pair in imgLabelList]

    createDataset(outputPath, imgPaths, txtLists, lexiconList=None, checkValid=True)
 

That is all for this example of generating an LMDB-format file with Python. I hope it serves as a useful reference, and I hope you will continue to support me.