Spicy
regexp.h
1 // Copyright (c) 2020-2021 by the Zeek Project. See LICENSE for details.
2 
3 #pragma once
4 
5 #include <memory>
6 #include <optional>
7 #include <string>
8 #include <tuple>
9 #include <utility>
10 #include <vector>
11 
12 #include <hilti/rt/extension-points.h>
13 #include <hilti/rt/types/bytes.h>
14 #include <hilti/rt/types/stream.h>
15 #include <hilti/rt/types/vector.h>
16 
// Opaque forward declarations of the C types from the jrx regexp library.
// This header only ever names them through pointers (and a shared_ptr), so
// the incomplete types suffice and 'jrx.h' does not need to be included here.
17 extern "C" {
18 struct jrx_regex_t;
19 struct jrx_match_state;
20 }
21 
22 namespace hilti::rt {
23 
// Forward declaration: `regexp::MatchState` below takes a `const RegExp&` in
// its constructor, while the full `RegExp` definition only follows afterwards.
24 class RegExp;
25 
26 namespace regexp {
27 
/**
 * Set of boolean options attached to a regular expression. Both options
 * default to off; two flag sets compare equal iff both options agree.
 */
struct Flags {
    bool no_sub = false;  // NOTE(review): presumably turns off sub-expression capturing -- confirm in regexp.cc
    bool use_std = false; // NOTE(review): presumably selects the standard matcher -- confirm in regexp.cc

    /** Returns true if `a` and `b` have identical settings for every flag. */
    friend bool operator==(const Flags& a, const Flags& b) {
        if ( a.no_sub != b.no_sub )
            return false;

        return a.use_std == b.use_std;
    }

    /** Returns true if `a` and `b` differ in at least one flag. */
    friend bool operator!=(const Flags& a, const Flags& b) {
        return a.no_sub != b.no_sub || a.use_std != b.use_std;
    }
};
36 
37 /*
 * Type for passing around the content of extracted capture groups: a vector
 * of `Bytes` values. This is the type returned by `MatchState::captures()`.
 */
38 using Captures = Vector<Bytes>;
39 
/**
 * State for matching a `RegExp` against input that is handed over through
 * the `advance()` overloads. The actual state lives in a heap-allocated,
 * privately defined `Pimpl` object so that this header does not need 'jrx.h'.
 */
41 class MatchState {
42 public:
    /**
     * Creates match state for the regular expression `re`.
     *
     * @param re regular expression to match against
     */
48  MatchState(const RegExp& re);

    // NOTE(review): presumably creates a state that is not tied to any
    // regexp yet -- confirm against the definition in regexp.cc.
49  MatchState() noexcept;

    // Destructor, copy and move operations are all user-declared and defined
    // out of line: `Pimpl` is an incomplete type in this header, and the
    // `std::unique_ptr` member cannot be copied implicitly.
50  ~MatchState();
51  MatchState(const MatchState& other);
52  MatchState(MatchState&& /*unused*/) noexcept;
53  MatchState& operator=(const MatchState& other);
54  MatchState& operator=(MatchState&& /*unused*/) noexcept;
55 
    /**
     * Feeds a stream view into the matcher.
     *
     * @param data view of the input to process
     * @return a tuple of an integer result code and a stream view.
     * NOTE(review): presumably the integer signals match/no-match/need-more
     * data and the view is the not-yet-consumed remainder -- confirm against
     * regexp.cc before relying on it.
     */
72  std::tuple<int32_t, stream::View> advance(const stream::View& data);
73 
    /**
     * Feeds a chunk of bytes into the matcher.
     *
     * @param data bytes to process
     * @param is_final defaults to false; NOTE(review): presumably marks
     * `data` as the last chunk of input -- confirm against regexp.cc
     * @return a tuple of an integer result code and an unsigned 64-bit value.
     * NOTE(review): the second element presumably is an offset into the
     * input -- confirm against regexp.cc.
     */
92  std::tuple<int32_t, uint64_t> advance(const Bytes& data, bool is_final = false);
93 
    /**
     * Returns the content of capture groups as a `Captures` vector.
     *
     * @param data stream to take the captured content from.
     * NOTE(review): presumably must be the stream the state was advanced
     * over -- confirm against regexp.cc.
     */
102  Captures captures(const Stream& data) const;
103 
104 private:
    // Internal helper taking a stream view plus an explicit `is_final` flag.
    // NOTE(review): presumably the shared backend of the public `advance()`
    // overloads -- confirm.
105  std::pair<int32_t, uint64_t> _advance(const stream::View& data, bool is_final);
106 
107  // PIMPLing here means we have to allocate dynamic memory, which
108  // isn't great for this class. However, without PIMPL we get a new dependency on
109  // 'jrx.h', which isn't great either, so we go with this.
110  class Pimpl;
111  std::unique_ptr<Pimpl> _pimpl;
112 };
113 
114 } // namespace regexp
115 
117 class RegExp {
118 public:
126  RegExp(std::string pattern, regexp::Flags flags = regexp::Flags());
127 
137  RegExp(const std::vector<std::string>& patterns, regexp::Flags flags = regexp::Flags());
138 
139  RegExp() = default;
140 
141  const auto& patterns() const { return _patterns; }
142  const auto& flags() const { return _flags; }
143 
156  int32_t match(const Bytes& data) const;
157 
171  Vector<Bytes> matchGroups(const Bytes& data) const;
172 
183  std::tuple<int32_t, Bytes> find(const Bytes& data) const;
184 
190  regexp::MatchState tokenMatcher() const;
191 
192  friend bool operator==(const RegExp& a, const RegExp& b) {
193  // NOTE: `_jrx_shared` is deliberately not included in the comparison.
194  return a._flags == b._flags && a._patterns == b._patterns;
195  }
196 
197  friend bool operator!=(const RegExp& a, const RegExp& b) { return ! (a == b); }
198 
199 private:
200  friend class regexp::MatchState;
201 
202  jrx_regex_t* _jrx() const {
203  assert(_jrx_shared && "regexp not compiled");
204  return _jrx_shared.get();
205  }
206  const auto& _jrxShared() const { return _jrx_shared; }
207 
208  // Backend for the the searching and matching methods.
209  int16_t _search_pattern(jrx_match_state* ms, const char* data, size_t len, int32_t* so, int32_t* eo) const;
210 
211  void _newJrx();
212  void _compileOne(std::string pattern, int idx);
213 
214  regexp::Flags _flags{};
215  std::vector<std::string> _patterns;
216  std::shared_ptr<jrx_regex_t>
217  _jrx_shared; // Shared ptr so that we can copy by value, and safely share with match state.
218 };
219 
220 namespace detail::adl {
221 extern std::string to_string(const RegExp& x, adl::tag /*unused*/);
222 
223 inline std::string to_string(const regexp::MatchState& /*unused*/, adl::tag /*unused*/) {
224  return "<regexp-match-state>";
225 }
226 
227 } // namespace detail::adl
228 
229 inline std::ostream& operator<<(std::ostream& out, const RegExp& x) {
230  out << to_string(x);
231  return out;
232 }
233 
234 } // namespace hilti::rt
std::string to_string(T &&x)
Definition: extension-points.h:26
Definition: regexp.h:41
Definition: any.h:7
Definition: regexp.h:117
Definition: bytes.h:154
Definition: stream.h:978
Definition: regexp.h:28
bool no_sub
Definition: regexp.h:29
bool use_std
Definition: regexp.h:30
Definition: stream.h:1379
Definition: regexp.cc:40
Definition: vector.h:251
Definition: elements.cc:17