CRNN training requires the dataset in LMDB format, and the Python code below generates such an LMDB dataset. One pitfall to watch for: the images must be read as raw binary data. The original script opened them in text mode, which only behaves correctly on Linux; on Windows the bytes get mangled and every image then fails the validity check. The version below opens images with 'rb' so it works on either system. Other problems you may run into are noted in the comments inside the code.
# -*- coding: utf-8 -*-
import os
import glob
import cv2
import lmdb  # pip install lmdb first
import numpy as np


def checkImageIsValid(imageBin):
    # Decode the raw bytes with OpenCV; reject unreadable or empty images.
    if imageBin is None:
        return False
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return False
    imgH, imgW = img.shape[0], img.shape[1]
    if imgH * imgW == 0:
        return False
    return True


def writeCache(env, cache):
    with env.begin(write=True) as txn:
        for k, v in cache.iteritems():
            txn.put(k, v)


def createDataset(outputPath, imagePathList, labelList, lexiconList=None, checkValid=True):
    """
    Create LMDB dataset for CRNN training.
    ARGS:
        outputPath    : LMDB output path
        imagePathList : list of image paths
        labelList     : list of corresponding ground-truth texts
        lexiconList   : (optional) list of lexicon lists
        checkValid    : if True, check the validity of every image
    """
    assert(len(imagePathList) == len(labelList))
    nSamples = len(imagePathList)
    print '...................'
    # map_size is the maximum database size in bytes. The common default of
    # 1099511627776 (1 TB) is far more than needed here; 8589934592 (8 GB) is
    # enough, and a too-large value can trigger an insufficient-disk-space error.
    env = lmdb.open(outputPath, map_size=8589934592)
    cache = {}
    cnt = 1
    for i in xrange(nSamples):
        imagePath = imagePathList[i]
        label = labelList[i]
        if not os.path.exists(imagePath):
            print('%s does not exist' % imagePath)
            continue
        # Read in binary mode ('rb'): text mode ('r') corrupts the image bytes
        # on Windows, and every image then fails the validity check below.
        with open(imagePath, 'rb') as f:
            imageBin = f.read()
        if checkValid:
            if not checkImageIsValid(imageBin):
                print('%s is not a valid image' % imagePath)
                continue
        imageKey = 'image-%09d' % cnt
        labelKey = 'label-%09d' % cnt
        cache[imageKey] = imageBin
        cache[labelKey] = label
        if lexiconList:
            lexiconKey = 'lexicon-%09d' % cnt
            cache[lexiconKey] = ' '.join(lexiconList[i])
        # Flush to disk every 1000 samples to keep the in-memory cache small.
        if cnt % 1000 == 0:
            writeCache(env, cache)
            cache = {}
            print('Written %d / %d' % (cnt, nSamples))
        cnt += 1
    nSamples = cnt - 1
    cache['num-samples'] = str(nSamples)
    writeCache(env, cache)
    print('Created dataset with %d samples' % nSamples)


def read_text(path):
    with open(path) as f:
        text = f.read()
    text = text.strip()
    return text


if __name__ == '__main__':
    # LMDB output directory. Run this script twice, once for the training set
    # and once for the validation set.
    outputPath = 'D:/ruanjianxiazai/tuxiangyangben/fengehou/train'
    # The .jpg images and their .txt label files live in the same directory.
    path = 'D:/ruanjianxiazai/tuxiangyangben/fengehou/chenguang/*.jpg'
    imagePathList = glob.glob(path)
    print '------------', len(imagePathList), '------------'
    imgLabelLists = []
    for p in imagePathList:
        try:
            imgLabelLists.append((p, read_text(p.replace('.jpg', '.txt'))))
        except IOError:
            # Skip images whose .txt label file is missing.
            continue
    # Sort by label length.
    imgLabelList = sorted(imgLabelLists, key=lambda x: len(x[1]))
    imgPaths = [p[0] for p in imgLabelList]
    txtLists = [p[1] for p in imgLabelList]
    createDataset(outputPath, imgPaths, txtLists, lexiconList=None, checkValid=True)
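After generating the database, it is worth verifying that the samples were written correctly. The following is a minimal sketch (not part of the original script) that reopens the LMDB read-only, reads the sample count, and decodes the first image. It assumes the key scheme used above ('num-samples', 'image-%09d', 'label-%09d') and reuses the same output path; adjust the path to your own.

# Sanity-check sketch: open the generated LMDB and decode sample 1.
import cv2
import lmdb
import numpy as np

env = lmdb.open('D:/ruanjianxiazai/tuxiangyangben/fengehou/train', readonly=True)
with env.begin() as txn:
    nSamples = int(txn.get('num-samples'))
    print 'num-samples:', nSamples
    imageBin = txn.get('image-%09d' % 1)
    label = txn.get('label-%09d' % 1)
    img = cv2.imdecode(np.frombuffer(imageBin, dtype=np.uint8), cv2.IMREAD_GRAYSCALE)
    print 'label:', label, 'image shape:', img.shape

If the image decodes to a valid shape and the label matches the corresponding .txt file, the dataset is ready for CRNN training.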
That is the complete example of generating an LMDB-format dataset in Python. I hope it serves as a useful reference, and I appreciate your continued support.