1 #include "caffe2/operators/text_file_reader_utils.h" 10 Tokenizer::Tokenizer(
const std::vector<char>& delims,
char escape)
13 std::memset(delimTable_, 0,
sizeof(delimTable_));
14 for (
int i = 0; i < delims.size(); ++i) {
15 delimTable_[(
unsigned char)delims.at(i)] = i + 1;
19 void Tokenizer::reset() {
25 void Tokenizer::next(
char* start,
char* end, TokenizedString& tokenized) {
26 tokenized.modifiedStrings_.clear();
27 tokenized.tokens_.clear();
29 char* currentStart = start;
30 std::string* copied =
nullptr;
31 if (!leftover_.empty()) {
32 tokenized.modifiedStrings_.emplace_back(
new std::string());
33 copied = tokenized.modifiedStrings_.back().get();
34 *copied = std::move(leftover_);
38 for (ch = start + toBeSkipped_; ch < end; ++ch) {
41 tokenized.modifiedStrings_.emplace_back(
new std::string());
42 copied = tokenized.modifiedStrings_.back().get();
44 copied->append(currentStart, ch);
45 currentStart = ch + 1;
50 int newDelimId = delimTable_[(
unsigned char)*ch];
53 tokenized.tokens_.emplace_back();
54 auto& token = tokenized.tokens_.back();
55 token.startDelimId = startDelimId_;
57 copied->append(currentStart, ch);
58 const char* c_str = copied->data();
60 token.end = c_str + copied->size();
62 token.start = currentStart;
65 currentStart = ch + 1;
67 startDelimId_ = newDelimId - 1;
70 tokenized.lastDelim_ = startDelimId_;
72 toBeSkipped_ = ch - end;
74 copied->append(currentStart, end);
75 leftover_ = std::move(*copied);
77 leftover_.assign(currentStart, end);
81 FileReader::FileReader(
const std::string& path,
size_t bufferSize)
82 : bufferSize_(bufferSize), buffer_(new char[bufferSize]) {
83 fd_ = open(path.c_str(), O_RDONLY, 0777);
85 throw std::runtime_error(
86 "Error opening file for reading: " + std::string(std::strerror(errno)) +
91 void FileReader::reset() {
92 if (lseek(fd_, 0, SEEK_SET) == -1) {
93 throw std::runtime_error(
94 "Error reseting file cursor: " + std::string(std::strerror(errno)));
98 FileReader::~FileReader() {
104 void FileReader::operator()(CharRange& range) {
105 char* buffer = buffer_.get();
106 auto numRead = read(fd_, buffer, bufferSize_);
108 throw std::runtime_error(
109 "Error reading file: " + std::string(std::strerror(errno)));
112 range.start =
nullptr;
116 range.start = buffer;
117 range.end = buffer + numRead;
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...