Caffe2 - Python API
A deep learning, cross-platform ML framework
create_your_own_dataset.py
#########################################################
#
# DO NOT EDIT THIS FILE. IT IS GENERATED AUTOMATICALLY. #
# PLEASE LOOK INTO THE README FOR MORE INFORMATION.     #
#
#########################################################


# coding: utf-8

# # How do I create my own dataset?
#
# Caffe2 uses a binary DB format to store the data that we would like to train models on. A Caffe2 DB is just a glorified name for a key-value store, where the keys are usually randomized so that the batches are approximately i.i.d. The values are the real payload: they contain the serialized strings of the specific data formats that you would like your training algorithm to ingest. So, the stored DB would look (semantically) like this:
#
# key1 value1
# key2 value2
# key3 value3
# ...
#
# The DB itself treats the keys and values as plain strings, but you probably want structured contents. One way to do this is to use a TensorProtos protocol buffer: it essentially wraps Tensors, aka multi-dimensional arrays, together with the tensor data type and shape information. One can then use the TensorProtosDBInput operator to load the data in minibatches for SGD training.
#
# Here, we will show one example of how to create your own dataset. To this end, we will use the UCI Iris dataset - a classic dataset for classifying Iris flowers. It contains 4 real-valued features representing the dimensions of the flower, and a label marking each sample as one of 3 species of Iris. The dataset can be downloaded [here](https://archive.ics.uci.edu/ml/datasets/Iris).
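#
# For orientation, the relevant messages in caffe2.proto look roughly like
# this (an abridged sketch; field numbers, defaults, and the payload fields
# for other data types are omitted):
#
#     message TensorProto {
#       repeated int64 dims;          // tensor shape
#       optional DataType data_type;  // e.g. FLOAT, INT32
#       repeated float float_data;    // payload for FLOAT tensors
#       repeated int32 int32_data;    // payload for INT32 tensors
#     }
#     message TensorProtos {
#       repeated TensorProto protos;
#     }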


# In[1]:


# First, let's import a few things we need.
import urllib2  # for downloading the dataset from the web.
import numpy as np
from matplotlib import pyplot
from StringIO import StringIO
from caffe2.python import core, utils, workspace
from caffe2.proto import caffe2_pb2


# In[2]:


f = urllib2.urlopen('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data')
raw_data = f.read()
print('Raw data looks like this:')
print(raw_data[:100] + '...')


# In[3]:


# Load the features into a feature matrix.
features = np.loadtxt(StringIO(raw_data), dtype=np.float32, delimiter=',', usecols=(0, 1, 2, 3))
# Load the labels into a label vector.
label_converter = lambda s: {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}[s]
labels = np.loadtxt(StringIO(raw_data), dtype=np.int, delimiter=',', usecols=(4,), converters={4: label_converter})
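
# A quick sanity check (an added aside): we expect 150 rows of 4 features
# and 150 integer labels.
print('features shape:', features.shape)  # expected: (150, 4)
print('labels shape:', labels.shape)      # expected: (150,)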


# Before we do training, one thing that is often beneficial is to separate the dataset into training and testing. In this case, let's randomly shuffle the data, use the first 100 data points for training, and the remaining 50 for testing. For more sophisticated approaches, you can use e.g. cross validation to separate your dataset into multiple training and testing splits (a short sketch follows below). Read more about cross validation [here](http://scikit-learn.org/stable/modules/cross_validation.html).
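#
# As a hedged sketch (not needed for the rest of this tutorial, and it
# assumes scikit-learn >= 0.18 is installed), a 5-fold split could look like:
#
#     from sklearn.model_selection import KFold
#     for train_idx, test_idx in KFold(n_splits=5, shuffle=True).split(features):
#         print(train_idx.shape, test_idx.shape)  # 120 train / 30 test per fold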

# In[4]:


random_index = np.random.permutation(150)
features = features[random_index]
labels = labels[random_index]

train_features = features[:100]
train_labels = labels[:100]
test_features = features[100:]
test_labels = labels[100:]


# In[5]:


# Let's plot the first two features together with the label.
# Remember, even though we are plotting the testing feature
# distribution here too, in real research you should not peek
# into the testing data.
legend = ['rx', 'b+', 'go']
pyplot.title("Training data distribution, feature 0 and 1")
for i in range(3):
    pyplot.plot(train_features[train_labels == i, 0], train_features[train_labels == i, 1], legend[i])
pyplot.figure()
pyplot.title("Testing data distribution, feature 0 and 1")
for i in range(3):
    pyplot.plot(test_features[test_labels == i, 0], test_features[test_labels == i, 1], legend[i])


# Now, as promised, let's put things into a Caffe2 DB. For each data point, we will use "train_xxx" as the key and a TensorProtos object as the value, which stores two tensors: one for the feature and one for the label. We will use Caffe2's Python DB interface to do so.

# In[6]:


# First, let's see how one can construct a TensorProtos protocol buffer from numpy arrays.
feature_and_label = caffe2_pb2.TensorProtos()
feature_and_label.protos.extend([
    utils.NumpyArrayToCaffe2Tensor(features[0]),
    utils.NumpyArrayToCaffe2Tensor(labels[0])])
print('This is what the tensor proto looks like for a feature and its label:')
print(str(feature_and_label))
print('This is the compact string that gets written into the db:')
print(feature_and_label.SerializeToString())
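
# As an added sanity check (a sketch, not part of the original flow): the
# serialized string round-trips back into the same numpy arrays via
# protobuf's ParseFromString and the Caffe2TensorToNumpyArray helper.
roundtrip = caffe2_pb2.TensorProtos()
roundtrip.ParseFromString(feature_and_label.SerializeToString())
print('Recovered feature:', utils.Caffe2TensorToNumpyArray(roundtrip.protos[0]))
print('Recovered label:', utils.Caffe2TensorToNumpyArray(roundtrip.protos[1]))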


# In[7]:


# Now, actually write the db.

def write_db(db_type, db_name, features, labels):
    db = core.C.create_db(db_type, db_name, core.C.Mode.write)
    transaction = db.new_transaction()
    for i in range(features.shape[0]):
        feature_and_label = caffe2_pb2.TensorProtos()
        feature_and_label.protos.extend([
            utils.NumpyArrayToCaffe2Tensor(features[i]),
            utils.NumpyArrayToCaffe2Tensor(labels[i])])
        transaction.put(
            'train_%03d' % i,  # keys only need to be unique within the db
            feature_and_label.SerializeToString())
    # Close the transaction, and then close the db.
    del transaction
    del db

write_db("minidb", "iris_train.minidb", train_features, train_labels)
write_db("minidb", "iris_test.minidb", test_features, test_labels)
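
# The minidb files are ordinary files on disk; as a quick added check
# (a sketch, assuming they were written to the current working directory):
import os
for name in ["iris_train.minidb", "iris_test.minidb"]:
    print('%s: %d bytes' % (name, os.path.getsize(name)))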


# Now, let's create a very simple network that consists of a single TensorProtosDBInput operator, to showcase how we load data from the DB that we created. For training, you might want to do something more complex: create a network, train it, get the model, and run the prediction service. To this end you can look at the MNIST tutorial for details.

# In[8]:


net_proto = core.Net("example_reader")
dbreader = net_proto.CreateDB([], "dbreader", db="iris_train.minidb", db_type="minidb")
net_proto.TensorProtosDBInput([dbreader], ["X", "Y"], batch_size=16)

print("The net looks like this:")
print(str(net_proto.Proto()))


# In[9]:


workspace.CreateNet(net_proto)


# In[10]:


# Let's run it to get batches of features.
workspace.RunNet(net_proto.Proto().name)
print("The first batch of features is:")
print(workspace.FetchBlob("X"))
print("The first batch of labels is:")
print(workspace.FetchBlob("Y"))

# Let's run again.
workspace.RunNet(net_proto.Proto().name)
print("The second batch of features is:")
print(workspace.FetchBlob("X"))
print("The second batch of labels is:")
print(workspace.FetchBlob("Y"))
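
# A closing note (an added aside): TensorProtosDBInput keeps a cursor into
# the db, so successive RunNet calls walk through it; when the reader hits
# the end of the db it wraps around to the beginning, so you can keep
# drawing batches indefinitely, e.g.:
for _ in range(5):
    workspace.RunNet(net_proto.Proto().name)
    print(workspace.FetchBlob("Y"))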