Caffe2 - C++ API
A deep learning, cross platform ML framework
text_file_reader_utils.cc
1 #include "caffe2/operators/text_file_reader_utils.h"
2 
3 #include <fcntl.h>
4 #include <cerrno>
5 #include <cstring>
6 #include <sstream>
7 
8 namespace caffe2 {
9 
10 Tokenizer::Tokenizer(const std::vector<char>& delims, char escape)
11  : escape_(escape) {
12  reset();
13  std::memset(delimTable_, 0, sizeof(delimTable_));
14  for (int i = 0; i < delims.size(); ++i) {
15  delimTable_[(unsigned char)delims.at(i)] = i + 1;
16  }
17 }
18 
19 void Tokenizer::reset() {
20  toBeSkipped_ = 0;
21  startDelimId_ = 0;
22  leftover_.clear();
23 }
24 
25 void Tokenizer::next(char* start, char* end, TokenizedString& tokenized) {
26  tokenized.modifiedStrings_.clear();
27  tokenized.tokens_.clear();
28 
29  char* currentStart = start;
30  std::string* copied = nullptr;
31  if (!leftover_.empty()) {
32  tokenized.modifiedStrings_.emplace_back(new std::string());
33  copied = tokenized.modifiedStrings_.back().get();
34  *copied = std::move(leftover_);
35  }
36 
37  char* ch;
38  for (ch = start + toBeSkipped_; ch < end; ++ch) {
39  if (*ch == escape_) {
40  if (!copied) {
41  tokenized.modifiedStrings_.emplace_back(new std::string());
42  copied = tokenized.modifiedStrings_.back().get();
43  }
44  copied->append(currentStart, ch);
45  currentStart = ch + 1;
46  // skip next character, since it's escaped
47  ++ch;
48  continue;
49  }
50  int newDelimId = delimTable_[(unsigned char)*ch];
51  if (newDelimId > 0) {
52  // found delimiter
53  tokenized.tokens_.emplace_back();
54  auto& token = tokenized.tokens_.back();
55  token.startDelimId = startDelimId_;
56  if (copied) {
57  copied->append(currentStart, ch);
58  const char* c_str = copied->data();
59  token.start = c_str;
60  token.end = c_str + copied->size();
61  } else {
62  token.start = currentStart;
63  token.end = ch;
64  }
65  currentStart = ch + 1;
66  copied = nullptr;
67  startDelimId_ = newDelimId - 1;
68  }
69  }
70  tokenized.lastDelim_ = startDelimId_;
71 
72  toBeSkipped_ = ch - end;
73  if (copied) {
74  copied->append(currentStart, end);
75  leftover_ = std::move(*copied);
76  } else {
77  leftover_.assign(currentStart, end);
78  }
79 }
80 
81 FileReader::FileReader(const std::string& path, size_t bufferSize)
82  : bufferSize_(bufferSize), buffer_(new char[bufferSize]) {
83  fd_ = open(path.c_str(), O_RDONLY, 0777);
84  if (fd_ < 0) {
85  throw std::runtime_error(
86  "Error opening file for reading: " + std::string(std::strerror(errno)) +
87  " Path=" + path);
88  }
89 }
90 
91 void FileReader::reset() {
92  if (lseek(fd_, 0, SEEK_SET) == -1) {
93  throw std::runtime_error(
94  "Error reseting file cursor: " + std::string(std::strerror(errno)));
95  }
96 }
97 
98 FileReader::~FileReader() {
99  if (fd_ >= 0) {
100  close(fd_);
101  }
102 }
103 
104 void FileReader::operator()(CharRange& range) {
105  char* buffer = buffer_.get();
106  auto numRead = read(fd_, buffer, bufferSize_);
107  if (numRead == -1) {
108  throw std::runtime_error(
109  "Error reading file: " + std::string(std::strerror(errno)));
110  }
111  if (numRead == 0) {
112  range.start = nullptr;
113  range.end = nullptr;
114  return;
115  }
116  range.start = buffer;
117  range.end = buffer + numRead;
118 }
119 }
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13