Caffe2 - C++ API
A deep learning, cross-platform ML framework
text_file_reader_utils.cc
1 
17 #include "caffe2/operators/text_file_reader_utils.h"
18 
19 #include <fcntl.h>
20 #include <cerrno>
21 #include <cstring>
22 #include <sstream>
23 
24 namespace caffe2 {
25 
26 Tokenizer::Tokenizer(const std::vector<char>& delims, char escape)
27  : escape_(escape) {
28  reset();
29  std::memset(delimTable_, 0, sizeof(delimTable_));
30  for (int i = 0; i < delims.size(); ++i) {
31  delimTable_[(unsigned char)delims.at(i)] = i + 1;
32  }
33 }
34 
35 void Tokenizer::reset() {
36  toBeSkipped_ = 0;
37  startDelimId_ = 0;
38  leftover_.clear();
39 }
40 
41 void Tokenizer::next(char* start, char* end, TokenizedString& tokenized) {
42  tokenized.modifiedStrings_.clear();
43  tokenized.tokens_.clear();
44 
45  char* currentStart = start;
46  std::string* copied = nullptr;
47  if (!leftover_.empty()) {
48  tokenized.modifiedStrings_.emplace_back(new std::string());
49  copied = tokenized.modifiedStrings_.back().get();
50  *copied = std::move(leftover_);
51  }
52 
53  char* ch;
54  for (ch = start + toBeSkipped_; ch < end; ++ch) {
55  if (*ch == escape_) {
56  if (!copied) {
57  tokenized.modifiedStrings_.emplace_back(new std::string());
58  copied = tokenized.modifiedStrings_.back().get();
59  }
60  copied->append(currentStart, ch);
61  currentStart = ch + 1;
62  // skip next character, since it's escaped
63  ++ch;
64  continue;
65  }
66  int newDelimId = delimTable_[(unsigned char)*ch];
67  if (newDelimId > 0) {
68  // found delimiter
69  tokenized.tokens_.emplace_back();
70  auto& token = tokenized.tokens_.back();
71  token.startDelimId = startDelimId_;
72  if (copied) {
73  copied->append(currentStart, ch);
74  const char* c_str = copied->data();
75  token.start = c_str;
76  token.end = c_str + copied->size();
77  } else {
78  token.start = currentStart;
79  token.end = ch;
80  }
81  currentStart = ch + 1;
82  copied = nullptr;
83  startDelimId_ = newDelimId - 1;
84  }
85  }
86  tokenized.lastDelim_ = startDelimId_;
87 
88  toBeSkipped_ = ch - end;
89  if (copied) {
90  copied->append(currentStart, end);
91  leftover_ = std::move(*copied);
92  } else {
93  leftover_.assign(currentStart, end);
94  }
95 }
96 
97 FileReader::FileReader(const std::string& path, size_t bufferSize)
98  : bufferSize_(bufferSize), buffer_(new char[bufferSize]) {
99  fd_ = open(path.c_str(), O_RDONLY, 0777);
100  if (fd_ < 0) {
101  throw std::runtime_error(
102  "Error opening file for reading: " + std::string(std::strerror(errno)) +
103  " Path=" + path);
104  }
105 }
106 
107 void FileReader::reset() {
108  if (lseek(fd_, 0, SEEK_SET) == -1) {
109  throw std::runtime_error(
110  "Error reseting file cursor: " + std::string(std::strerror(errno)));
111  }
112 }
113 
114 FileReader::~FileReader() {
115  if (fd_ >= 0) {
116  close(fd_);
117  }
118 }
119 
120 void FileReader::operator()(CharRange& range) {
121  char* buffer = buffer_.get();
122  auto numRead = read(fd_, buffer, bufferSize_);
123  if (numRead == -1) {
124  throw std::runtime_error(
125  "Error reading file: " + std::string(std::strerror(errno)));
126  }
127  if (numRead == 0) {
128  range.start = nullptr;
129  range.end = nullptr;
130  return;
131  }
132  range.start = buffer;
133  range.end = buffer + numRead;
134 }
135 }
Copyright (c) 2016-present, Facebook, Inc.