Caffe2 - C++ API
A deep learning, cross platform ML framework
text_file_reader_utils.h
1 #ifndef CAFFE2_OPERATORS_TEXT_FILE_READER_UTILS_H
2 #define CAFFE2_OPERATORS_TEXT_FILE_READER_UTILS_H
3 
4 #include <memory>
5 #include <string>
6 #include <vector>
7 
8 #include "caffe2/core/common.h"
9 
10 namespace caffe2 {
11 
12 struct CAFFE2_API Token {
13  int startDelimId;
14  const char* start;
15  const char* end;
16 };
17 
18 class CAFFE2_API TokenizedString {
19  // holder for strings that have been modified
20  std::vector<std::shared_ptr<std::string>> modifiedStrings_;
21  std::vector<Token> tokens_;
22  int lastDelim_;
23 
24  public:
25  const std::vector<Token>& tokens() const {
26  return tokens_;
27  }
28  int lastDelim() const {
29  return lastDelim_;
30  }
31  friend class Tokenizer;
32 };
33 
34 class CAFFE2_API Tokenizer {
35  private:
36  int startDelimId_;
37  // state of the tokenizer
38  std::string leftover_;
39  // if we need to skip the first characters of the next batch because
40  // e.g. an escape char that was the last character of the last batch.
41  int toBeSkipped_;
42  int delimTable_[256];
43  const char escape_;
44 
45  public:
46  Tokenizer(const std::vector<char>& delimiters, char escape);
47  void reset();
48  void next(char* start, char* end, TokenizedString& tokenized);
49 };
50 
51 struct CAFFE2_API CharRange {
52  char* start;
53  char* end;
54 };
55 
56 struct CAFFE2_API StringProvider {
57  virtual void operator()(CharRange&) = 0;
58  virtual void reset() = 0;
59  virtual ~StringProvider() {}
60 };
61 
62 class CAFFE2_API BufferedTokenizer {
63  public:
64  BufferedTokenizer(const Tokenizer& t, StringProvider* p, int numPasses = 1)
65  : provider_(p), tokenizer_(t), tokenIndex_(0), numPasses_(numPasses) {}
66 
67  bool next(Token& token) {
68  CharRange range;
69  while (tokenIndex_ >= tokenized_.tokens().size()) {
70  range.start = nullptr;
71  while (range.start == nullptr && pass_ < numPasses_) {
72  (*provider_)(range);
73  if (range.start == nullptr) {
74  ++pass_;
75  if (pass_ < numPasses_) {
76  provider_->reset();
77  tokenizer_.reset();
78  }
79  }
80  }
81  if (range.start == nullptr) {
82  return false;
83  }
84  tokenizer_.next(range.start, range.end, tokenized_);
85  tokenIndex_ = 0;
86  }
87  token = tokenized_.tokens()[tokenIndex_++];
88  return true;
89  };
90 
91  int endDelim() const {
92  if (tokenIndex_ + 1 < tokenized_.tokens().size()) {
93  return tokenized_.tokens()[tokenIndex_ + 1].startDelimId;
94  }
95  return tokenized_.lastDelim();
96  }
97 
98  private:
99  StringProvider* provider_;
100  Tokenizer tokenizer_;
101  TokenizedString tokenized_;
102  int tokenIndex_;
103  int numPasses_;
104  int pass_{0};
105 };
106 
107 class CAFFE2_API FileReader : public StringProvider {
108  public:
109  explicit FileReader(const std::string& path, size_t bufferSize = 65536);
110  ~FileReader();
111  void operator()(CharRange& range) override;
112  void reset() override;
113 
114  private:
115  const size_t bufferSize_;
116  int fd_;
117  std::unique_ptr<char[]> buffer_;
118 };
119 
120 } // namespace caffe2
121 
122 #endif // CAFFE2_OPERATORS_TEXT_FILE_READER_UTILS_H
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13