Spicy
regexp.h
1 // Copyright (c) 2020-2021 by the Zeek Project. See LICENSE for details.
2 
3 #pragma once
4 
5 #include <memory>
6 #include <optional>
7 #include <string>
8 #include <tuple>
9 #include <utility>
10 #include <vector>
11 
12 #include <hilti/rt/extension-points.h>
13 #include <hilti/rt/types/bytes.h>
14 #include <hilti/rt/types/stream.h>
15 #include <hilti/rt/types/vector.h>
16 
17 extern "C" {
18 struct jrx_regex_t;
19 struct jrx_match_state;
20 }
21 
22 namespace hilti::rt {
23 
24 class RegExp;
25 
26 namespace regexp {
27 
/**
 * Flags accompanying the pattern(s) of a regular expression. Two flag sets
 * compare equal exactly when every field agrees.
 */
struct Flags {
    /** NOTE(review): presumably turns off support for capturing sub-expressions -- confirm in the implementation. */
    bool no_sub{false};

    /** NOTE(review): presumably forces use of the standard matcher -- confirm in the implementation. */
    bool use_std{false};

    /** Field-wise equality across all flags. */
    friend bool operator==(const Flags& lhs, const Flags& rhs) {
        return std::tie(lhs.no_sub, lhs.use_std) == std::tie(rhs.no_sub, rhs.use_std);
    }

    /** Negation of `operator==`. */
    friend bool operator!=(const Flags& lhs, const Flags& rhs) { return ! (lhs == rhs); }
};
36 
37 /* Type for passing around the content of extracted capture groups. */
38 using Captures = Vector<Bytes>;
39 
41 class MatchState {
42 public:
48  MatchState(const RegExp& re);
49  MatchState() noexcept;
50  ~MatchState();
51  MatchState(const MatchState& other);
52  MatchState(MatchState&& /*unused*/) noexcept;
53  MatchState& operator=(const MatchState& other);
54  MatchState& operator=(MatchState&& /*unused*/) noexcept;
55 
74  std::tuple<int32_t, stream::View> advance(const stream::View& data);
75 
98  std::tuple<int32_t, int64_t> advance(const Bytes& data, bool is_final = false);
99 
108  Captures captures(const Stream& data) const;
109 
110 private:
111  // Returns (rc, bytes-consumed). Note that the latter can be negative if
112  // backtracking is required.
113  std::pair<int32_t, int64_t> _advance(const stream::View& data, bool is_final);
114 
115  // PIMPLing here means we have to allocate dynamic memory, which
116  // isn't great for this class. However, without PIMPL we get a new dependency on
117  // 'jrx.h', which isn't great either, so we go with this.
118  class Pimpl;
119  std::unique_ptr<Pimpl> _pimpl;
120 };
121 
122 } // namespace regexp
123 
125 class RegExp {
126 public:
134  RegExp(std::string pattern, regexp::Flags flags = regexp::Flags());
135 
145  RegExp(const std::vector<std::string>& patterns, regexp::Flags flags = regexp::Flags());
146 
147  RegExp() = default;
148 
149  const auto& patterns() const { return _patterns; }
150  const auto& flags() const { return _flags; }
151 
164  int32_t match(const Bytes& data) const;
165 
179  Vector<Bytes> matchGroups(const Bytes& data) const;
180 
191  std::tuple<int32_t, Bytes> find(const Bytes& data) const;
192 
198  regexp::MatchState tokenMatcher() const;
199 
200  friend bool operator==(const RegExp& a, const RegExp& b) {
201  // NOTE: `_jrx_shared` is deliberately not included in the comparison.
202  return a._flags == b._flags && a._patterns == b._patterns;
203  }
204 
205  friend bool operator!=(const RegExp& a, const RegExp& b) { return ! (a == b); }
206 
207 private:
208  friend class regexp::MatchState;
209 
210  jrx_regex_t* _jrx() const {
211  assert(_jrx_shared && "regexp not compiled");
212  return _jrx_shared.get();
213  }
214  const auto& _jrxShared() const { return _jrx_shared; }
215 
216  // Backend for the the searching and matching methods.
217  int16_t _search_pattern(jrx_match_state* ms, const char* data, size_t len, int32_t* so, int32_t* eo) const;
218 
219  void _newJrx();
220  void _compileOne(std::string pattern, int idx);
221 
222  regexp::Flags _flags{};
223  std::vector<std::string> _patterns;
224  std::shared_ptr<jrx_regex_t>
225  _jrx_shared; // Shared ptr so that we can copy by value, and safely share with match state.
226 };
227 
228 namespace detail::adl {
229 extern std::string to_string(const RegExp& x, adl::tag /*unused*/);
230 
231 inline std::string to_string(const regexp::MatchState& /*unused*/, adl::tag /*unused*/) {
232  return "<regexp-match-state>";
233 }
234 
235 } // namespace detail::adl
236 
237 inline std::ostream& operator<<(std::ostream& out, const RegExp& x) {
238  out << to_string(x);
239  return out;
240 }
241 
242 } // namespace hilti::rt
std::string to_string(T &&x)
Definition: extension-points.h:26
Definition: regexp.h:41
Definition: any.h:7
Definition: regexp.h:125
Definition: bytes.h:157
Definition: stream.h:1001
Definition: regexp.h:28
bool no_sub
Definition: regexp.h:29
bool use_std
Definition: regexp.h:30
Definition: stream.h:1407
Definition: regexp.cc:40
Definition: vector.h:251
Definition: elements.cc:17