Deep Learning: Data
Yang Li

Introduction

The following works with public datasets, applying some image transformations and conversions to them as the necessary preprocessing before modeling.

All data-related processing is collected in this chapter.

Note that some open-source datasets can be loaded and used directly through PyTorch's packages, so there is no need to convert everything into image files such as .jpg. Other datasets ship as plain images plus labels and have to be processed by hand; what follows is simply shared experience. If you plan to start directly from images, you can skip the earlier parts.
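
For the first case, here is a minimal sketch of loading CIFAR-10 directly through torchvision without any conversion to .jpg; the root path and batch size are only example values, not settings from this post.

# A minimal sketch, not part of the original workflow: load CIFAR-10 directly
# through torchvision instead of converting it to image files first.
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader

transform = transforms.ToTensor()
train_set = torchvision.datasets.CIFAR10(root='./data', train=True,
                                         download=True, transform=transform)
train_loader = DataLoader(train_set, batch_size=64, shuffle=True)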

1. The dataset

Downloading the open-source dataset

The dataset can be found on its official website; here is one example. The official version can be downloaded from the link below; I used the CIFAR-10 Python version.
https://www.cs.toronto.edu/~kriz/cifar.html

2. Converting the dataset to images

After downloading and extracting the archive, the dataset can be converted into images.
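
For reference, a small extraction sketch; it assumes the downloaded archive sits at ./data/cifar-10-python.tar.gz (the path is an assumption) and unpacks it into the cifar-10-batches-py folder used by the conversion script below.

# Hedged sketch: unpack the official archive so that
# ./data/cifar-10-python/cifar-10-batches-py exists afterwards.
import tarfile

with tarfile.open('./data/cifar-10-python.tar.gz', 'r:gz') as tar:
    tar.extractall('./data/cifar-10-python')

The script below then converts the unpacked batches into .jpg files.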

import os
import numpy as np
import cv2

source_path = os.getcwd()

# The unpacking function given on the official site (Python 3); returns a data dict
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

loc_1 = './data/cifar-10/train_cifar10/'
loc_2 = './data/cifar-10/test_cifar10/'

# Create the output folders if they do not exist yet
if not os.path.exists(loc_1):
    os.makedirs(loc_1)
if not os.path.exists(loc_2):
    os.makedirs(loc_2)


# The training set has five batches of 10,000 images each; the test set has 10,000 images
def cifar10_img(file_dir):
    for i in range(1, 6):
        data_name = file_dir + '/' + 'data_batch_' + str(i)
        data_dict = unpickle(data_name)
        print(data_name + ' is processing')

        for j in range(10000):
            img = np.reshape(data_dict[b'data'][j], (3, 32, 32))
            img = np.transpose(img, (1, 2, 0))
            # The stored channel order is RGB; swap to BGR so cv2.imwrite saves the colors correctly
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            # To save in a different format, just change the file extension
            img_name = loc_1 + str(data_dict[b'labels'][j]) + str(i * 10000 + j) + '.jpg'
            cv2.imwrite(img_name, img)

        print(data_name + ' is done')

    test_data_name = file_dir + '/test_batch'
    print(test_data_name + ' is processing')
    test_dict = unpickle(test_data_name)

    for m in range(10000):
        img = np.reshape(test_dict[b'data'][m], (3, 32, 32))
        img = np.transpose(img, (1, 2, 0))
        # The stored channel order is RGB; swap to BGR for cv2.imwrite
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        # To save in a different format, just change the file extension
        img_name = loc_2 + str(test_dict[b'labels'][m]) + str(10000 + m) + '.jpg'
        cv2.imwrite(img_name, img)
    print(test_data_name + ' is done')
    print('Finish transforming to image')


if __name__ == '__main__':
    file_dir = os.path.join(source_path, 'data/cifar-10-python/cifar-10-batches-py')
    cifar10_img(file_dir)
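
As a quick sanity check (my own addition, not in the original script): CIFAR-10 has 50,000 training and 10,000 test images, so the two output folders should contain exactly that many files.

# Optional check after running the conversion script above
print(len(os.listdir(loc_1)))   # expected: 50000
print(len(os.listdir(loc_2)))   # expected: 10000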

3. Extracting the labels

import os
import numpy as np
import cv2
import json
import pickle

source_path = os.getcwd()
anno_loc = os.path.join(source_path, 'data/annotations/')

# Create the annotation folder if it does not exist yet
if not os.path.exists(anno_loc):
    os.makedirs(anno_loc)

# Lists holding the image file names and their annotations
train_filenames = []
train_annotations = []

test_filenames = []
test_annotations = []

# The same unpacking function as in the conversion script above
def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

# The training set has five batches of 10,000 images each; the test set has 10,000 images
def cifar10_annotations(file_dir):
    print('create train_img annotations')
    for i in range(1, 6):
        data_name = file_dir + '/' + 'data_batch_' + str(i)
        data_dict = unpickle(data_name)
        print(data_name + ' is processing')
        for j in range(10000):
            img_name = str(data_dict[b'labels'][j]) + str(i * 10000 + j) + '.jpg'
            img_annotations = data_dict[b'labels'][j]
            train_filenames.append(img_name)
            train_annotations.append(img_annotations)
        print(data_name + ' is done')

    test_data_name = file_dir + '/test_batch'
    print(test_data_name + ' is processing')
    test_dict = unpickle(test_data_name)

    for m in range(10000):
        testimg_name = str(test_dict[b'labels'][m]) + str(10000 + m) + '.jpg'
        testimg_annotations = test_dict[b'labels'][m]
        test_filenames.append(testimg_name)
        test_annotations.append(testimg_annotations)

    print(test_data_name + ' is done')
    print('Finish file processing')


if __name__ == '__main__':

    file_dir = os.path.join(source_path, 'data/cifar-10-python/cifar-10-batches-py')
    cifar10_annotations(file_dir)

    train_annot_dict = {
        'images': train_filenames,
        'categories': train_annotations
    }
    test_annot_dict = {
        'images': test_filenames,
        'categories': test_annotations
    }

    train_json = json.dumps(train_annot_dict)
    train_file = open(os.path.join(source_path, 'data/annotations/cifar10_train.json'), 'w')
    train_file.write(train_json)
    train_file.close()

    test_json = json.dumps(test_annot_dict)
    test_file = open(os.path.join(source_path, 'data/annotations/cifar10_test.json'), 'w')
    test_file.write(test_json)
    test_file.close()
    print('annotations have been written to the json files')
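
A quick check of the generated file (my own addition): load the json back and confirm that file names and labels line up, one entry per training image.

# Optional check after running the annotation script above
with open(os.path.join(source_path, 'data/annotations/cifar10_train.json')) as f:
    annot = json.load(f)
print(len(annot['images']), len(annot['categories']))   # expected: 50000 50000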

4. Generating the annotation file

# Generate the annotation file
import json

save_path_json_train = r'.\jupyter\data\train.json'
save_path_json_test = r'.\data\test.json'

label = {}
# Labels are 0 and 1: files listed in name_type_df get label 1, files in name_type_df2 get label 0
for i in name_type_df["name"]:
    label.update({i: 1})
for i in name_type_df2["name"]:
    label.update({i: 0})
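
The snippet above assumes two DataFrames, name_type_df and name_type_df2, whose "name" columns hold the file names of the positive and the negative class. They are not built in this post; a hypothetical way to construct them from two class folders could look like this (the folder names are assumptions):

# Hypothetical construction of name_type_df / name_type_df2
import os
import pandas as pd

name_type_df = pd.DataFrame({'name': os.listdir(r'.\data\class_1')})   # images labeled 1
name_type_df2 = pd.DataFrame({'name': os.listdir(r'.\data\class_0')})  # images labeled 0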

5. Splitting the training and test sets yourself

import os
import json
import shutil
import torch
from torch.utils.data import random_split

dataset = list(label)
# Split 70% / 30%
train_dataset, test_dataset = random_split(
    dataset=dataset,
    lengths=[int(len(label) * 0.7), len(label) - int(0.7 * len(label))],
    generator=torch.Generator().manual_seed(0)
)

train_lab = {}
for i in train_dataset:
    train_lab.update({i: label[i]})
test_lab = {}
for i in test_dataset:
    test_lab.update({i: label[i]})

# Write the label dictionaries to json files
a = json.dumps(train_lab)
f1 = open(save_path_json_train, 'w')
f1.write(a)
f1.close()
b = json.dumps(test_lab)
f2 = open(save_path_json_test, 'w')
f2.write(b)
f2.close()

# Copy the images; picture_path (defined elsewhere) is the folder holding the original images
train_pic_path = r".\jupyter\data\train_pic"
test_pic_path = r".\jupyter\data\test_pic"
for p in (train_pic_path, test_pic_path):
    if not os.path.exists(p):
        os.makedirs(p)

from torchvision import transforms
from PIL import Image

# Crop size for the images (256); note that the cropped image itself is not saved here,
# the original file is copied unchanged
crop = transforms.RandomResizedCrop(256)
list_img_size = []
for i in train_lab:
    org_path = os.path.join(picture_path, i)
    new_path = os.path.join(train_pic_path, i)
    img = Image.open(org_path)
    # Keep a record of the original image sizes
    list_img_size.append(img.size)
    croped_img = crop(img)
    shutil.copy2(org_path, new_path)
for i in test_lab:
    org_path = os.path.join(picture_path, i)
    new_path = os.path.join(test_pic_path, i)
    img = Image.open(org_path)
    croped_img = crop(img)
    shutil.copy2(org_path, new_path)
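
A small check (my own addition) that the split came out roughly 70/30 and that both labels are present:

# Optional check after the split above
print(len(train_lab), len(test_lab))
print(sum(train_lab.values()) / len(train_lab))   # fraction of label-1 samples in the training set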

6. Loading the dataset from the split images

Loading the dataset involves quite a few adjustments, including cropping, the batch size, and so on.

Loading a custom image dataset

import json
import os
import time
import argparse
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image

source_path = os.getcwd()


class _IMG(Dataset):
    def __init__(self, root, train=True, transform=None, target_transform=None):
        super(_IMG, self).__init__()
        self.train = train
        # Resize and crop the images while loading the dataset
        self.transform = transforms.Compose([transforms.Resize(225),
                                             transforms.CenterCrop(224),
                                             transforms.ToTensor()])
        self.target_transform = target_transform

        # Load the training set when train=True, otherwise the test set
        if self.train:
            file_annotation = os.path.join(source_path, 'data/train.json')
            img_folder = os.path.join(source_path, 'data/train_pic/')
        else:
            file_annotation = os.path.join(source_path, 'data/test.json')
            img_folder = os.path.join(source_path, 'data/test_pic/')
        fp = open(file_annotation, 'r')
        data_dict = json.load(fp)

        # Number of samples listed in the annotation file
        num_data = len(data_dict)

        self.filenames = []
        self.labels = []
        self.img_folder = img_folder
        for i in range(num_data):
            self.filenames.append(list(data_dict.keys())[i])
            self.labels.append(list(data_dict.values())[i])

    def __getitem__(self, index):
        img_name = self.img_folder + self.filenames[index]
        label = self.labels[index]
        img = plt.imread(img_name)
        PIL_image = Image.fromarray(img)  # convert the numpy array back into a PIL image

        PIL_image = self.transform(PIL_image)  # apply the transforms defined above
        return PIL_image, label

    def __len__(self):
        return len(self.filenames)


batch_size = 64   # example values; the original post does not show these settings
num_workers = 2

train_data = _IMG(os.path.join(source_path, 'data/train.json'), train=True)
test_data = _IMG(os.path.join(source_path, 'data/test.json'), train=False)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)
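
A quick usage check (my own addition): pull one batch from the loader and inspect the shapes, which should match the Resize/CenterCrop settings above.

# Fetch a single batch from the DataLoader defined above
images, labels = next(iter(train_loader))
print(images.shape)   # expected: [batch_size, 3, 224, 224]
print(labels[:8])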

Some usage of transforms.Compose

self.transform = transforms.Compose([transforms.Resize([224,224]), transforms.CenterCrop([224,224]), transforms.ToTensor(),transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))])

The transforms part took me quite a while to get right. transforms.Compose is a pipeline in which you can chain operations such as resizing, cropping, and normalization.

Note that when Resize is given a single number, the shorter side of the image is scaled to that size (keeping the aspect ratio); the following CenterCrop then cuts the region computed from the image center.
ToTensor converts the PIL image (or ndarray) into a tensor and scales the pixel values to [0, 1].
transforms.Normalize then normalizes each channel with the given mean and standard deviation.
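
A small sketch illustrating the point above (the file name is just an example): with a single number, Resize scales the shorter side to that length while keeping the aspect ratio, and CenterCrop then cuts a square from the middle.

from PIL import Image
from torchvision import transforms

img = Image.open('example.jpg')                 # e.g. 500 x 375
resized = transforms.Resize(225)(img)           # shorter side becomes 225, aspect ratio kept
cropped = transforms.CenterCrop(224)(resized)   # 224 x 224 region cut from the center
print(resized.size, cropped.size)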

Use drop_last when the dataset size is not divisible by batch_size

test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True, num_workers=num_workers,drop_last=True)

With drop_last=True, the final incomplete batch is simply dropped.
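
For example (my own numbers), with 10,000 samples and batch_size = 64:

# 10000 // 64 full batches are kept, the remaining 10000 % 64 samples are dropped
print(10000 // 64, 10000 % 64)   # 156 16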

7. Displaying a single image

plt.imshow(train_loader.dataset[0][0][0], cmap='gray')
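
That call only shows the first channel of the first image in grayscale. A small sketch (my own addition) for displaying the full RGB image: each sample is a 3x224x224 tensor, so permute it to height-width-channel order first.

img, label = train_loader.dataset[0]
plt.imshow(img.permute(1, 2, 0))
plt.title(str(label))
plt.show()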
  • Post title: Deep Learning: Data
  • Post author: Yang Li
  • Create time: 2022-09-20 21:12:00
  • Post link: https://yangli-os.github.io//2022/09/20/深度学习-数据篇/
  • Copyright Notice: All articles in this blog are licensed under BY-NC-SA unless stated otherwise.