Flex  0.17.9
arrow_utils.h
Go to the documentation of this file.
1 
15 #ifndef UTILS_ARROW_UTILS_H_
16 #define UTILS_ARROW_UTILS_H_
17 
18 #include <arrow/api.h>
19 #include <arrow/util/value_parsing.h>
20 #include <memory>
22 #include "glog/logging.h"
23 
24 namespace gs {
25 
26 // arrow related;
27 
28 class LDBCTimeStampParser : public arrow::TimestampParser {
29  public:
30  LDBCTimeStampParser() = default;
31 
32  ~LDBCTimeStampParser() override {}
33 
34  bool operator()(const char* s, size_t length, arrow::TimeUnit::type out_unit,
35  int64_t* out,
36  bool* out_zone_offset_present = NULLPTR) const override {
37  using seconds_type = std::chrono::duration<arrow::TimestampType::c_type>;
38 
39  // We allow the following zone offset formats:
40  // - (none)
41  // - Z
42  // - [+-]HH(:?MM)?
43  //
44  // We allow the following formats for all units:
45  // - "YYYY-MM-DD"
46  // - "YYYY-MM-DD[ T]hhZ?"
47  // - "YYYY-MM-DD[ T]hh:mmZ?"
48  // - "YYYY-MM-DD[ T]hh:mm:ssZ?"
49  //
50  // We allow the following formats for unit == MILLI, MICRO, or NANO:
51  // - "YYYY-MM-DD[ T]hh:mm:ss.s{1,3}Z?"
52  //
53  // We allow the following formats for unit == MICRO, or NANO:
54  // - "YYYY-MM-DD[ T]hh:mm:ss.s{4,6}Z?"
55  //
56  // We allow the following formats for unit == NANO:
57  // - "YYYY-MM-DD[ T]hh:mm:ss.s{7,9}Z?"
58  //
59  // UTC is always assumed, and the DataType's timezone is ignored.
60  //
61 
62  if (ARROW_PREDICT_FALSE(length < 10))
63  return false;
64 
65  seconds_type seconds_since_epoch;
66 #if defined(ARROW_VERSION) && ARROW_VERSION < 15000000
67  if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseYYYY_MM_DD(
68  s, &seconds_since_epoch))) {
69 #else
70  if (ARROW_PREDICT_FALSE(
71  !arrow::internal::ParseYYYY_MM_DD(s, &seconds_since_epoch))) {
72 #endif
73  return false;
74  }
75 
76  if (length == 10) {
77  *out =
78  arrow::util::CastSecondsToUnit(out_unit, seconds_since_epoch.count());
79  return true;
80  }
81 
82  if (ARROW_PREDICT_FALSE(s[10] != ' ') &&
83  ARROW_PREDICT_FALSE(s[10] != 'T')) {
84  return false;
85  }
86 
87  // In the implementation of arrow ISO8601 timestamp parser, the zone offset
88  // is set to true if the input string contains a zone offset. However, we
89  // parse the zone offset here but don't set the boolean flag.
90  // https://github.com/apache/arrow/blob/3e7ae5340a123c1040f98f1c36687b81362fab52/cpp/src/arrow/csv/converter.cc#L373
91  // The reason is that, if we want the zone offset to be set, we need to
92  // to declare the zone offset in the schema and construct TimeStampType with
93  // that offset. However, we just want to parse the timestamp string and
94  // convert it to a timestamp value, we have no assumption of the local time
95  // zone, and we don't require the zone offset to be set in the schema.
96  // Same for following commented code.
97  //-------------------------------------------------------------------------
98  // if (out_zone_offset_present) {
99  // *out_zone_offset_present = false;
100  // }
101  //-------------------------------------------------------------------------
102 
103  seconds_type zone_offset(0);
104  if (s[length - 1] == 'Z') {
105  --length;
106  // if (out_zone_offset_present)
107  // *out_zone_offset_present = true;
108  } else if (s[length - 3] == '+' || s[length - 3] == '-') {
109  // [+-]HH
110  length -= 3;
111  if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH(
112  s + length + 1, &zone_offset))) {
113  return false;
114  }
115  if (s[length] == '+')
116  zone_offset *= -1;
117  // if (out_zone_offset_present)
118  // *out_zone_offset_present = true;
119  } else if (s[length - 5] == '+' || s[length - 5] == '-') {
120  // [+-]HHMM
121  length -= 5;
122  if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHHMM(
123  s + length + 1, &zone_offset))) {
124  return false;
125  }
126  if (s[length] == '+')
127  zone_offset *= -1;
128  // if (out_zone_offset_present)
129  // *out_zone_offset_present = true;
130  } else if ((s[length - 6] == '+' || s[length - 6] == '-') &&
131  (s[length - 3] == ':')) {
132  // [+-]HH:MM
133  length -= 6;
134  if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH_MM(
135  s + length + 1, &zone_offset))) {
136  return false;
137  }
138  if (s[length] == '+')
139  zone_offset *= -1;
140  // if (out_zone_offset_present)
141  // *out_zone_offset_present = true;
142  }
143 
144  seconds_type seconds_since_midnight;
145  switch (length) {
146  case 13: // YYYY-MM-DD[ T]hh
147  if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH(
148  s + 11, &seconds_since_midnight))) {
149  return false;
150  }
151  break;
152  case 16: // YYYY-MM-DD[ T]hh:mm
153  if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH_MM(
154  s + 11, &seconds_since_midnight))) {
155  return false;
156  }
157  break;
158  case 19: // YYYY-MM-DD[ T]hh:mm:ss
159  case 21: // YYYY-MM-DD[ T]hh:mm:ss.s
160  case 22: // YYYY-MM-DD[ T]hh:mm:ss.ss
161  case 23: // YYYY-MM-DD[ T]hh:mm:ss.sss
162  case 24: // YYYY-MM-DD[ T]hh:mm:ss.ssss
163  case 25: // YYYY-MM-DD[ T]hh:mm:ss.sssss
164  case 26: // YYYY-MM-DD[ T]hh:mm:ss.ssssss
165  case 27: // YYYY-MM-DD[ T]hh:mm:ss.sssssss
166  case 28: // YYYY-MM-DD[ T]hh:mm:ss.ssssssss
167  case 29: // YYYY-MM-DD[ T]hh:mm:ss.sssssssss
168  if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH_MM_SS(
169  s + 11, &seconds_since_midnight))) {
170  return false;
171  }
172  break;
173  default:
174  LOG(ERROR) << "unsupported length: " << length;
175  return false;
176  }
177 
178  seconds_since_epoch += seconds_since_midnight;
179  seconds_since_epoch += zone_offset;
180 
181  if (length <= 19) {
182  *out =
183  arrow::util::CastSecondsToUnit(out_unit, seconds_since_epoch.count());
184  return true;
185  }
186 
187  if (ARROW_PREDICT_FALSE(s[19] != '.')) {
188  return false;
189  }
190 
191  uint32_t subseconds = 0;
192  if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseSubSeconds(
193  s + 20, length - 20, out_unit, &subseconds))) {
194  return false;
195  }
196 
197  *out =
198  arrow::util::CastSecondsToUnit(out_unit, seconds_since_epoch.count()) +
199  subseconds;
200  return true;
201  }
202 
203  const char* kind() const override { return "LDBC timestamp parser"; }
204 
205  const char* format() const override { return "EmptyFormat"; }
206 };
207 
208 class LDBCLongDateParser : public arrow::TimestampParser {
209  public:
210  using seconds_type = std::chrono::duration<arrow::TimestampType::c_type>;
211  LDBCLongDateParser() = default;
212 
213  ~LDBCLongDateParser() override {}
214 
215  bool operator()(const char* s, size_t length, arrow::TimeUnit::type out_unit,
216  int64_t* out,
217  bool* out_zone_offset_present = NULLPTR) const override {
218  uint64_t seconds;
219  // convert (s, s + length - 4) to seconds_since_epoch
220  if (ARROW_PREDICT_FALSE(
221  !arrow::internal::ParseUnsigned(s, length - 3, &seconds))) {
222  return false;
223  }
224 
225  uint32_t subseconds = 0;
226  if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseSubSeconds(
227  s + length - 3, 3, out_unit, &subseconds))) {
228  return false;
229  }
230 
231  *out = arrow::util::CastSecondsToUnit(out_unit, seconds) + subseconds;
232  return true;
233  }
234 
235  const char* kind() const override { return "LDBC timestamp parser"; }
236 
237  const char* format() const override { return "LongDateFormat"; }
238 };
239 
240 // Converts a C++ type to its Arrow type. Other types, such as emptyType and Date, are supported as well.
241 template <typename T>
243 
244 template <>
245 struct TypeConverter<bool> {
247  using ArrowType = arrow::BooleanType;
248  using ArrowArrayType = arrow::BooleanArray;
249  static std::shared_ptr<arrow::DataType> ArrowTypeValue() {
250  return arrow::boolean();
251  }
252 };
253 
254 template <>
255 struct TypeConverter<int32_t> {
257  using ArrowType = arrow::Int32Type;
258  using ArrowArrayType = arrow::Int32Array;
259  static std::shared_ptr<arrow::DataType> ArrowTypeValue() {
260  return arrow::int32();
261  }
262 };
263 
264 template <>
265 struct TypeConverter<uint32_t> {
267  using ArrowType = arrow::UInt32Type;
268  using ArrowArrayType = arrow::UInt32Array;
269  static std::shared_ptr<arrow::DataType> ArrowTypeValue() {
270  return arrow::uint32();
271  }
272 };
273 
274 template <>
275 struct TypeConverter<int64_t> {
277  using ArrowType = arrow::Int64Type;
278  using ArrowArrayType = arrow::Int64Array;
279  static std::shared_ptr<arrow::DataType> ArrowTypeValue() {
280  return arrow::int64();
281  }
282 };
283 
284 template <>
285 struct TypeConverter<uint64_t> {
287  using ArrowType = arrow::UInt64Type;
288  using ArrowArrayType = arrow::UInt64Array;
289  static std::shared_ptr<arrow::DataType> ArrowTypeValue() {
290  return arrow::uint64();
291  }
292 };
293 
294 template <>
295 struct TypeConverter<double> {
297  using ArrowType = arrow::DoubleType;
298  using ArrowArrayType = arrow::DoubleArray;
299  static std::shared_ptr<arrow::DataType> ArrowTypeValue() {
300  return arrow::float64();
301  }
302 };
303 
304 template <>
305 struct TypeConverter<float> {
307  using ArrowType = arrow::FloatType;
308  using ArrowArrayType = arrow::FloatArray;
309  static std::shared_ptr<arrow::DataType> ArrowTypeValue() {
310  return arrow::float32();
311  }
312 };
313 template <>
314 struct TypeConverter<std::string> {
316  using ArrowType = arrow::LargeStringType;
317  using ArrowArrayType = arrow::LargeStringArray;
318  static std::shared_ptr<arrow::DataType> ArrowTypeValue() {
319  return arrow::large_utf8();
320  }
321 };
322 
323 template <>
324 struct TypeConverter<std::string_view> {
326  using ArrowType = arrow::LargeStringType;
327  using ArrowArrayType = arrow::LargeStringArray;
328  static std::shared_ptr<arrow::DataType> ArrowTypeValue() {
329  return arrow::large_utf8();
330  }
331 };
332 
333 template <>
334 struct TypeConverter<Date> {
336  using ArrowType = arrow::TimestampType;
337  using ArrowArrayType = arrow::TimestampArray;
338  static std::shared_ptr<arrow::DataType> ArrowTypeValue() {
339  return arrow::timestamp(arrow::TimeUnit::MILLI);
340  }
341 };
342 
343 template <>
346  using ArrowType = arrow::TimestampType;
347  using ArrowArrayType = arrow::TimestampArray;
348  static std::shared_ptr<arrow::DataType> ArrowTypeValue() {
349  return arrow::timestamp(arrow::TimeUnit::MILLI);
350  }
351 };
352 
353 std::shared_ptr<arrow::DataType> PropertyTypeToArrowType(PropertyType type);
354 } // namespace gs
355 
356 #endif // UTILS_ARROW_UTILS_H_
gs::PropertyType::kFloat
static const PropertyType kFloat
Definition: types.h:139
gs::TypeConverter< uint32_t >::property_type
static PropertyType property_type()
Definition: arrow_utils.h:266
gs::LDBCLongDateParser::format
const char * format() const override
Definition: arrow_utils.h:237
gs::LDBCLongDateParser::~LDBCLongDateParser
~LDBCLongDateParser() override
Definition: arrow_utils.h:213
gs::LDBCLongDateParser::seconds_type
std::chrono::duration< arrow::TimestampType::c_type > seconds_type
Definition: arrow_utils.h:210
gs::TypeConverter< std::string_view >::ArrowTypeValue
static std::shared_ptr< arrow::DataType > ArrowTypeValue()
Definition: arrow_utils.h:328
gs::TypeConverter< bool >::ArrowType
arrow::BooleanType ArrowType
Definition: arrow_utils.h:247
gs::TypeConverter< Date >::ArrowArrayType
arrow::TimestampArray ArrowArrayType
Definition: arrow_utils.h:337
gs::Day
Definition: types.h:221
gs::TypeConverter< int32_t >::property_type
static PropertyType property_type()
Definition: arrow_utils.h:256
gs::TypeConverter< bool >::ArrowTypeValue
static std::shared_ptr< arrow::DataType > ArrowTypeValue()
Definition: arrow_utils.h:249
gs::TypeConverter< std::string >::ArrowType
arrow::LargeStringType ArrowType
Definition: arrow_utils.h:316
gs::TypeConverter< double >::ArrowTypeValue
static std::shared_ptr< arrow::DataType > ArrowTypeValue()
Definition: arrow_utils.h:299
types.h
gs::TypeConverter< bool >::property_type
static PropertyType property_type()
Definition: arrow_utils.h:246
gs::PropertyTypeToArrowType
std::shared_ptr< arrow::DataType > PropertyTypeToArrowType(PropertyType type)
Definition: arrow_utils.cc:18
gs::TypeConverter< uint64_t >::ArrowArrayType
arrow::UInt64Array ArrowArrayType
Definition: arrow_utils.h:288
gs::TypeConverter< float >::ArrowTypeValue
static std::shared_ptr< arrow::DataType > ArrowTypeValue()
Definition: arrow_utils.h:309
gs::LDBCTimeStampParser::LDBCTimeStampParser
LDBCTimeStampParser()=default
gs::LDBCTimeStampParser
Definition: arrow_utils.h:28
gs::TypeConverter< uint64_t >::ArrowTypeValue
static std::shared_ptr< arrow::DataType > ArrowTypeValue()
Definition: arrow_utils.h:289
gs::TypeConverter< Day >::ArrowTypeValue
static std::shared_ptr< arrow::DataType > ArrowTypeValue()
Definition: arrow_utils.h:348
gs::TypeConverter< uint64_t >::property_type
static PropertyType property_type()
Definition: arrow_utils.h:286
gs::PropertyType::kDay
static const PropertyType kDay
Definition: types.h:144
gs::TypeConverter< Date >::ArrowType
arrow::TimestampType ArrowType
Definition: arrow_utils.h:336
gs::TypeConverter< double >::ArrowType
arrow::DoubleType ArrowType
Definition: arrow_utils.h:297
gs::PropertyType::kDate
static const PropertyType kDate
Definition: types.h:143
gs
Definition: adj_list.h:23
gs::TypeConverter< bool >::ArrowArrayType
arrow::BooleanArray ArrowArrayType
Definition: arrow_utils.h:248
gs::PropertyType::kUInt64
static const PropertyType kUInt64
Definition: types.h:141
gs::TypeConverter< int32_t >::ArrowType
arrow::Int32Type ArrowType
Definition: arrow_utils.h:257
gs::TypeConverter< float >::ArrowType
arrow::FloatType ArrowType
Definition: arrow_utils.h:307
gs::TypeConverter< float >::property_type
static PropertyType property_type()
Definition: arrow_utils.h:306
gs::TypeConverter< float >::ArrowArrayType
arrow::FloatArray ArrowArrayType
Definition: arrow_utils.h:308
gs::TypeConverter< int32_t >::ArrowArrayType
arrow::Int32Array ArrowArrayType
Definition: arrow_utils.h:258
gs::TypeConverter< std::string_view >::property_type
static PropertyType property_type()
Definition: arrow_utils.h:325
gs::PropertyType::kStringView
static const PropertyType kStringView
Definition: types.h:145
gs::LDBCLongDateParser
Definition: arrow_utils.h:208
gs::TypeConverter
Definition: arrow_utils.h:242
gs::TypeConverter< Date >::property_type
static PropertyType property_type()
Definition: arrow_utils.h:335
gs::LDBCLongDateParser::kind
const char * kind() const override
Definition: arrow_utils.h:235
gs::PropertyType::kUInt32
static const PropertyType kUInt32
Definition: types.h:138
gs::TypeConverter< std::string >::ArrowArrayType
arrow::LargeStringArray ArrowArrayType
Definition: arrow_utils.h:317
gs::PropertyType::kDouble
static const PropertyType kDouble
Definition: types.h:142
gs::LDBCTimeStampParser::format
const char * format() const override
Definition: arrow_utils.h:205
gs::TypeConverter< double >::ArrowArrayType
arrow::DoubleArray ArrowArrayType
Definition: arrow_utils.h:298
gs::TypeConverter< std::string >::ArrowTypeValue
static std::shared_ptr< arrow::DataType > ArrowTypeValue()
Definition: arrow_utils.h:318
gs::TypeConverter< std::string >::property_type
static PropertyType property_type()
Definition: arrow_utils.h:315
gs::PropertyType::kInt64
static const PropertyType kInt64
Definition: types.h:140
gs::TypeConverter< uint32_t >::ArrowType
arrow::UInt32Type ArrowType
Definition: arrow_utils.h:267
gs::TypeConverter< std::string_view >::ArrowArrayType
arrow::LargeStringArray ArrowArrayType
Definition: arrow_utils.h:327
gs::TypeConverter< double >::property_type
static PropertyType property_type()
Definition: arrow_utils.h:296
gs::TypeConverter< int64_t >::ArrowType
arrow::Int64Type ArrowType
Definition: arrow_utils.h:277
gs::LDBCTimeStampParser::~LDBCTimeStampParser
~LDBCTimeStampParser() override
Definition: arrow_utils.h:32
std
Definition: loading_config.h:232
gs::TypeConverter< Day >::ArrowType
arrow::TimestampType ArrowType
Definition: arrow_utils.h:346
gs::TypeConverter< int32_t >::ArrowTypeValue
static std::shared_ptr< arrow::DataType > ArrowTypeValue()
Definition: arrow_utils.h:259
gs::TypeConverter< int64_t >::ArrowArrayType
arrow::Int64Array ArrowArrayType
Definition: arrow_utils.h:278
gs::LDBCTimeStampParser::operator()
bool operator()(const char *s, size_t length, arrow::TimeUnit::type out_unit, int64_t *out, bool *out_zone_offset_present=NULLPTR) const override
Definition: arrow_utils.h:34
gs::TypeConverter< int64_t >::ArrowTypeValue
static std::shared_ptr< arrow::DataType > ArrowTypeValue()
Definition: arrow_utils.h:279
gs::TypeConverter< std::string_view >::ArrowType
arrow::LargeStringType ArrowType
Definition: arrow_utils.h:326
gs::TypeConverter< Day >::property_type
static PropertyType property_type()
Definition: arrow_utils.h:345
gs::PropertyType::kBool
static const PropertyType kBool
Definition: types.h:134
gs::TypeConverter< Day >::ArrowArrayType
arrow::TimestampArray ArrowArrayType
Definition: arrow_utils.h:347
gs::LDBCLongDateParser::operator()
bool operator()(const char *s, size_t length, arrow::TimeUnit::type out_unit, int64_t *out, bool *out_zone_offset_present=NULLPTR) const override
Definition: arrow_utils.h:215
gs::PropertyType
Definition: types.h:95
gs::TypeConverter< uint32_t >::ArrowArrayType
arrow::UInt32Array ArrowArrayType
Definition: arrow_utils.h:268
gs::PropertyType::kInt32
static const PropertyType kInt32
Definition: types.h:137
gs::TypeConverter< uint64_t >::ArrowType
arrow::UInt64Type ArrowType
Definition: arrow_utils.h:287
gs::LDBCTimeStampParser::kind
const char * kind() const override
Definition: arrow_utils.h:203
gs::TypeConverter< int64_t >::property_type
static PropertyType property_type()
Definition: arrow_utils.h:276
gs::TypeConverter< uint32_t >::ArrowTypeValue
static std::shared_ptr< arrow::DataType > ArrowTypeValue()
Definition: arrow_utils.h:269
gs::TypeConverter< Date >::ArrowTypeValue
static std::shared_ptr< arrow::DataType > ArrowTypeValue()
Definition: arrow_utils.h:338