Caffe2 - C++ API
A deep learning, cross platform ML framework
inline_container.h
1 #pragma once
2 
3 #include <cstdio>
4 #include <cstring>
5 #include <cerrno>
6 #include <istream>
7 #include <ostream>
8 #include <fstream>
9 
10 #include <c10/core/Allocator.h>
11 #include <c10/core/Backend.h>
12 
13 #include "caffe2/core/logging.h"
14 #include "caffe2/serialize/istream_adapter.h"
15 #include "caffe2/serialize/read_adapter_interface.h"
16 
17 extern "C" {
18 typedef struct mz_zip_archive mz_zip_archive;
19 }
20 
21 // PyTorch containers are a special zip archive with the following layout
22 // archive_name.zip contains:
23 // archive_name/
24 // version # a file with a single decimal number written in ascii,
25 // # used to establish the version of the archive format
26 // model.json # overall model description, this is a json output of
27 // # ModelDef from torch.proto
28 // # the following names are by convention only, model.json will
29 // # refer to these files by full names
30 // tensors/
31 // 0 # flat storage for tensor data, meta-data about shapes, etc. is
32 // # in model.json
33 // 1
34 // ...
35 // # code entries will only exist for modules that have methods attached
36 // code/
37 // archive_name.py # serialized torch script code (python syntax, using PythonPrint)
38 // archive_name_my_submodule.py # submodules have separate files
39 //
40 // The PyTorchStreamWriter also ensures additional useful properties for these files
41 // 1. All files are stored uncompressed.
42 // 2. All files in the archive are aligned to 64 byte boundaries such that
43 // it is possible to mmap the entire file and get an aligned pointer to
44 // tensor data.
45 // 3. We universally write in ZIP64 format for consistency.
46 
47 // The PyTorchStreamReader also provides additional properties:
48 // 1. It can read zip files that are created with common
49 // zip tools. This means that even though our writer doesn't compress files,
50 // the reader can still read files that were compressed.
51 // 2. It provides a getRecordOffset function which returns the offset into the
52 // raw file where file data lives. If the file was written with PyTorchStreamWriter
53 // it is guaranteed to be 64 byte aligned.
54 
55 // PyTorchReader/Writer handle checking the version number on the archive format
56 // and ensure that all files are written to a archive_name directory so they
57 // unzip cleanly.
58 
59 // When developing this format we want to pay particular attention to the
60 // following use cases:
61 //
62 // -- Reading --
63 // 1) Reading with full random access
64 // a) Reading with file api's such as fread()
65 // b) mmaping the file and jumping around the mapped region
66 // 2) Reading with 1-pass sequential access
67 // -> A reader will need to build up a data structure of parsed structures
68 // as it reads
69 //
70 // -- Writing --
71 // 1) Writing with full random access
72 // 2) Writing with 1-pass sequential access
73 // -> We must take care not to require updating values that have already
74 // been written. We place the variable-length index at the end and do
75 // not put any indices into the header to fulfill this constraint.
76 
77 // The model.json, which contains all the metadata information,
78 // should be written as the last file. One reason is that the size of tensor data is
79 // usually stable. As long as the shape and type of the tensor do not change,
80 // the size of the data won't change. On the other side, the size of the
81 // serialized model is likely to change, so we store it as the last record, and
82 // we don't need to move previous records when updating the model data.
83 
84 // The zip format is sufficiently flexible to handle the above use-case.
85 // it puts its central directory at the end of the archive and we write
86 // model.json as the last file when writing after we have accumulated all
87 // other information.
88 
89 namespace caffe2 {
90 namespace serialize {
91 
// Reader-side bounds: archives whose version file is outside
// [kMin, kMax] are rejected.
constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L;
constexpr uint64_t kMaxSupportedFileFormatVersion = 0x1L;

// Writer-specific constant: the version number written into the
// archive's "version" record by PyTorchStreamWriter.
// NOTE(review): this (0x2) is above kMaxSupportedFileFormatVersion (0x1),
// so the reader bounds look stale — confirm intended max.
constexpr uint64_t kFileFormatVersion = 0x2L;

// Writer-specific constant: byte alignment of each record's payload,
// so that an mmap of the whole archive yields aligned tensor data
// (see the "64 byte boundaries" property described above).
constexpr uint64_t kFieldAlignment = 64;
100 
// Reads records out of a PyTorch ZIP container. Accepts archives produced
// by PyTorchStreamWriter as well as ones made with ordinary zip tools
// (including compressed entries). Backed by miniz (mz_zip_archive).
class CAFFE2_API PyTorchStreamReader final {
 public:
  // Open an archive from a file path, an input stream, or an arbitrary
  // read adapter (the adapter overload takes ownership).
  explicit PyTorchStreamReader(const std::string& file_name);
  explicit PyTorchStreamReader(std::istream* in);
  explicit PyTorchStreamReader(std::unique_ptr<ReadAdapterInterface> in);

  // Read a whole record by name; returns (owning data pointer, byte size).
  std::tuple<at::DataPtr, size_t> getRecord(const std::string& name);

  // Offset of the record's payload within the raw file, for mmap-style
  // access. 64-byte aligned when the archive was written by
  // PyTorchStreamWriter (not guaranteed for foreign zip files).
  size_t getRecordOffset(const std::string& name);

 private:
  void init();
  // Read n bytes at absolute position pos into buf; returns bytes read.
  size_t read(uint64_t pos, char* buf, size_t n);
  // Check miniz state after an operation; `what` names the operation
  // for the error message.
  void valid(const char* what);
  // Map a record name to its miniz file index within the archive.
  size_t getFileID(const std::string& name);

  // miniz read callback; needs access to read()/in_.
  friend size_t
  istream_read_func(void* pOpaque, uint64_t file_ofs, void* pBuf, size_t n);
  std::unique_ptr<mz_zip_archive> ar_;
  std::string archive_name_;   // top-level directory name inside the zip
  std::unique_ptr<ReadAdapterInterface> in_;  // owned data source
};
126 
// Writes a PyTorch ZIP container: records stored uncompressed, payloads
// aligned to kFieldAlignment, ZIP64 format (see file header comment).
// Writes either to a caller-supplied ostream or, when out == nullptr,
// to a file it opens itself (file_stream_).
class CAFFE2_API PyTorchStreamWriter final {
 public:
  // archive_name becomes the top-level directory inside the zip.
  PyTorchStreamWriter(std::string archive_name, std::ostream* out=nullptr);
  // Stream-only convenience: uses "archive" as the directory name.
  PyTorchStreamWriter(std::ostream* out)
    : PyTorchStreamWriter("archive", out) {}

  // Append one record. Single-pass: data is written immediately and
  // cannot be rewritten later.
  void writeRecord(const std::string& name, const void* data, size_t size);
  // Write the central directory and finish the archive. After this,
  // finalized() is true and no further records may be written.
  void writeEndOfFile();

  bool finalized() const {
    return finalized_;
  }

  const std::string& archiveName() {
    return archive_name_;
  }

 private:
  // Check miniz state after an operation; `what` names the operation
  // for the error message.
  void valid(const char* what);
  size_t current_pos_ = 0;  // bytes written so far (output cursor)
  std::unique_ptr<mz_zip_archive> ar_;
  std::string archive_name_;
  std::ostream* out_;          // active sink (may point at file_stream_)
  std::ofstream file_stream_;  // owned sink used when no ostream was given
  bool finalized_ = false;
  // miniz write callback; needs access to out_/current_pos_.
  friend size_t ostream_write_func(void *pOpaque, uint64_t file_ofs, const void *pBuf, size_t n);
};
156 
157 } // namespace serialize
158 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13