Caffe2 - C++ API
A deep learning, cross platform ML framework
text_file_reader_utils.h
1 
17 #ifndef CAFFE2_OPERATORS_TEXT_FILE_READER_UTILS_H
18 #define CAFFE2_OPERATORS_TEXT_FILE_READER_UTILS_H
19 
20 #include <memory>
21 #include <string>
22 #include <vector>
23 
24 #include "caffe2/core/common.h"
25 
26 namespace caffe2 {
27 
/// A single token located inside a character buffer.
///
/// The token does not own its text: `start` and `end` point into storage
/// that lives elsewhere.
struct Token {
  /// Id of the delimiter associated with the beginning of this token.
  /// NOTE(review): presumably the delimiter that preceded the token -- confirm
  /// against the Tokenizer implementation.
  int startDelimId;
  /// First character of the token.
  const char* start;
  /// End of the token; presumably one past the last character -- TODO confirm.
  const char* end;
};
33 
35  // holder for strings that have been modified
36  std::vector<std::unique_ptr<std::string>> modifiedStrings_;
37  std::vector<Token> tokens_;
38  int lastDelim_;
39 
40  public:
41  const std::vector<Token>& tokens() const {
42  return tokens_;
43  }
44  int lastDelim() const {
45  return lastDelim_;
46  }
47  friend class Tokenizer;
48 };
49 
50 class Tokenizer {
51  private:
52  int startDelimId_;
53  // state of the tokenizer
54  std::string leftover_;
55  // if we need to skip the first characters of the next batch because
56  // e.g. an escape char that was the last character of the last batch.
57  int toBeSkipped_;
58  int delimTable_[256];
59  const char escape_;
60 
61  public:
62  Tokenizer(const std::vector<char>& delimiters, char escape);
63  void reset();
64  void next(char* start, char* end, TokenizedString& tokenized);
65 };
66 
/// A mutable, non-owning range of characters.
///
/// BufferedTokenizer treats a range whose `start` is null as "no more data".
struct CharRange {
  /// First character of the range, or nullptr when there is no data.
  char* start;
  /// End of the range; presumably one past the last character -- TODO confirm.
  char* end;
};
71 
73  virtual void operator()(CharRange&) = 0;
74  virtual void reset() = 0;
75  virtual ~StringProvider() {}
76 };
77 
79  public:
80  BufferedTokenizer(const Tokenizer& t, StringProvider* p, int numPasses = 1)
81  : provider_(p), tokenizer_(t), tokenIndex_(0), numPasses_(numPasses) {}
82 
83  bool next(Token& token) {
84  CharRange range;
85  while (tokenIndex_ >= tokenized_.tokens().size()) {
86  range.start = nullptr;
87  while (range.start == nullptr && pass_ < numPasses_) {
88  (*provider_)(range);
89  if (range.start == nullptr) {
90  ++pass_;
91  if (pass_ < numPasses_) {
92  provider_->reset();
93  tokenizer_.reset();
94  }
95  }
96  }
97  if (range.start == nullptr) {
98  return false;
99  }
100  tokenizer_.next(range.start, range.end, tokenized_);
101  tokenIndex_ = 0;
102  }
103  token = tokenized_.tokens()[tokenIndex_++];
104  return true;
105  };
106 
107  int endDelim() const {
108  if (tokenIndex_ + 1 < tokenized_.tokens().size()) {
109  return tokenized_.tokens()[tokenIndex_ + 1].startDelimId;
110  }
111  return tokenized_.lastDelim();
112  }
113 
114  private:
115  StringProvider* provider_;
116  Tokenizer tokenizer_;
117  TokenizedString tokenized_;
118  int tokenIndex_;
119  int numPasses_;
120  int pass_{0};
121 };
122 
123 class FileReader : public StringProvider {
124  public:
125  explicit FileReader(const std::string& path, size_t bufferSize = 65536);
126  ~FileReader();
127  void operator()(CharRange& range) override;
128  void reset() override;
129 
130  private:
131  const size_t bufferSize_;
132  int fd_;
133  std::unique_ptr<char[]> buffer_;
134 };
135 
136 } // namespace caffe2
137 
138 #endif // CAFFE2_OPERATORS_TEXT_FILE_READER_UTILS_H
Copyright (c) 2016-present, Facebook, Inc.