lmdb_dataset.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import numpy as np
  15. import os
  16. from paddle.io import Dataset
  17. import lmdb
  18. import cv2
  19. from .imaug import transform, create_operators
  class LMDBDataSet(Dataset):
      """Dataset that serves (image, label) samples stored in LMDB databases.

      Every leaf directory under the configured ``data_dir`` is opened as a
      separate read-only LMDB environment.  Each environment is read through
      the keys ``num-samples``, ``label-%09d`` and ``image-%09d``; sample
      numbers are 1-based.
      """

      def __init__(self, config, mode, logger, seed=None):
          """Open all LMDB environments and build the sample index table.

          Args:
              config: Mapping with a ``'Global'`` entry and a ``config[mode]``
                  entry holding ``'dataset'`` (``data_dir``, ``transforms``)
                  and ``'loader'`` (``shuffle``) sub-configs.
              mode: Key selecting which section of ``config`` to read.
              logger: Object exposing ``info(msg)``.
              seed: Accepted for interface compatibility; not used by this
                  class.  Shuffling relies on NumPy's global random state.
          """
          super(LMDBDataSet, self).__init__()

          global_config = config['Global']
          dataset_config = config[mode]['dataset']
          loader_config = config[mode]['loader']
          data_dir = dataset_config['data_dir']
          self.do_shuffle = loader_config['shuffle']

          self.lmdb_sets = self.load_hierarchical_lmdb_dataset(data_dir)
          logger.info("Initialize indexs of datasets:%s" % data_dir)
          self.data_idx_order_list = self.dataset_traversal()
          if self.do_shuffle:
              # Shuffles rows in place, so (lmdb index, sample index) pairs
              # stay together.
              np.random.shuffle(self.data_idx_order_list)
          self.ops = create_operators(dataset_config['transforms'],
                                      global_config)

      def load_hierarchical_lmdb_dataset(self, data_dir):
          """Open every leaf directory below ``data_dir`` as an LMDB env.

          Returns:
              dict mapping a running integer id to a dict with the keys
              ``dirpath``, ``env``, ``txn`` and ``num_samples``.
          """
          lmdb_sets = {}
          dataset_idx = 0
          for dirpath, dirnames, filenames in os.walk(data_dir + '/'):
              # Only directories without sub-directories are treated as
              # LMDB stores.
              if dirnames:
                  continue
              env = lmdb.open(
                  dirpath,
                  max_readers=32,
                  readonly=True,
                  lock=False,
                  readahead=False,
                  meminit=False)
              txn = env.begin(write=False)
              num_samples = int(txn.get('num-samples'.encode()))
              lmdb_sets[dataset_idx] = {
                  "dirpath": dirpath,
                  "env": env,
                  "txn": txn,
                  "num_samples": num_samples,
              }
              dataset_idx += 1
          return lmdb_sets

      def dataset_traversal(self):
          """Build the flat index table covering all opened LMDB stores.

          Returns:
              ``numpy`` array of shape ``(total_samples, 2)``.  Column 0 is
              the id of the LMDB store, column 1 the 1-based sample number
              inside that store.
          """
          sample_counts = [
              self.lmdb_sets[lno]['num_samples']
              for lno in range(len(self.lmdb_sets))
          ]
          # Default float dtype is kept on purpose: consumers convert the
          # entries with int(), and the array type stays what it always was.
          data_idx_order_list = np.zeros((sum(sample_counts), 2))
          beg_idx = 0
          for lno, count in enumerate(sample_counts):
              end_idx = beg_idx + count
              data_idx_order_list[beg_idx:end_idx, 0] = lno
              # Sample keys in the store start at 1, not 0.
              data_idx_order_list[beg_idx:end_idx, 1] = np.arange(1, count + 1)
              beg_idx = end_idx
          return data_idx_order_list

      def get_img_data(self, value):
          """Decode an encoded image buffer into a 3-channel colour image.

          Args:
              value: Bytes-like encoded image, or a falsy value.

          Returns:
              The decoded image, or ``None`` when ``value`` is empty/None or
              when ``cv2.imdecode`` cannot decode it.
          """
          if not value:
              return None
          imgdata = np.frombuffer(value, dtype='uint8')
          # imdecode itself yields None on failure, so no extra check needed.
          return cv2.imdecode(imgdata, cv2.IMREAD_COLOR)

      def get_lmdb_sample_info(self, txn, index):
          """Fetch the raw image buffer and label for sample ``index``.

          Args:
              txn: Read transaction of the LMDB store to query.
              index: 1-based sample number inside that store.

          Returns:
              ``(imgbuf, label)`` with ``label`` decoded as UTF-8, or ``None``
              when either the label or the image entry is missing.
          """
          label_key = 'label-%09d'.encode() % index
          label = txn.get(label_key)
          if label is None:
              return None
          label = label.decode('utf-8')

          img_key = 'image-%09d'.encode() % index
          imgbuf = txn.get(img_key)
          if imgbuf is None:
              # Treat a missing image like a missing label so the caller
              # resamples instead of handing None to the transforms.
              return None
          return imgbuf, label

      def __getitem__(self, idx):
          """Return the transformed sample at position ``idx``.

          When the sample cannot be read, or ``transform`` returns ``None``
          for it, a different randomly chosen position is tried instead.
          """
          lmdb_idx, file_idx = self.data_idx_order_list[idx]
          lmdb_idx = int(lmdb_idx)
          file_idx = int(file_idx)

          sample_info = self.get_lmdb_sample_info(
              self.lmdb_sets[lmdb_idx]['txn'], file_idx)
          if sample_info is None:
              return self.__getitem__(np.random.randint(len(self)))

          img, label = sample_info
          data = {'image': img, 'label': label}
          outs = transform(data, self.ops)
          if outs is None:
              return self.__getitem__(np.random.randint(len(self)))
          return outs

      def __len__(self):
          """Return the total number of indexed samples across all stores."""
          return self.data_idx_order_list.shape[0]