Ruby 3.3.7p123 (2025-01-15 revision be31f993d7fa0219d85f7b3c694d454da4ecc10b)
regexp.c
1#include "prism/regexp.h"
2
6typedef struct {
8 const uint8_t *start;
9
11 const uint8_t *cursor;
12
14 const uint8_t *end;
15
18
21
25
29static void
30pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) {
31 *parser = (pm_regexp_parser_t) {
32 .start = start,
33 .cursor = start,
34 .end = end,
35 .named_captures = named_captures,
36 .encoding_changed = encoding_changed,
37 .encoding = encoding
38 };
39}
40
44static void
45pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
46 pm_string_t string;
47 pm_string_shared_init(&string, start, end);
48 pm_string_list_append(parser->named_captures, &string);
49 pm_string_free(&string);
50}
51
55static inline bool
56pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
57 return parser->cursor >= parser->end;
58}
59
63static inline bool
64pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
65 if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
66 parser->cursor++;
67 return true;
68 }
69 return false;
70}
71
75static inline bool
76pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
77 if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
78 parser->cursor++;
79 return true;
80 }
81 return false;
82}
83
87static bool
88pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
89 if (pm_regexp_char_is_eof(parser)) {
90 return false;
91 }
92
93 const uint8_t *end = (const uint8_t *) pm_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding);
94 if (end == NULL) {
95 return false;
96 }
97
98 parser->cursor = end + 1;
99 return true;
100}
101
135static bool
136pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
137 const uint8_t *savepoint = parser->cursor;
138
139 enum {
140 PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
141 PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
142 PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
143 PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
144 } state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
145
146 while (1) {
147 switch (state) {
148 case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
149 switch (*parser->cursor) {
150 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
151 parser->cursor++;
152 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
153 break;
154 case ',':
155 parser->cursor++;
156 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
157 break;
158 default:
159 parser->cursor = savepoint;
160 return true;
161 }
162 break;
163 case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
164 switch (*parser->cursor) {
165 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
166 parser->cursor++;
167 break;
168 case ',':
169 parser->cursor++;
170 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
171 break;
172 case '}':
173 parser->cursor++;
174 return true;
175 default:
176 parser->cursor = savepoint;
177 return true;
178 }
179 break;
180 case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
181 switch (*parser->cursor) {
182 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
183 parser->cursor++;
184 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
185 break;
186 default:
187 parser->cursor = savepoint;
188 return true;
189 }
190 break;
191 case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
192 switch (*parser->cursor) {
193 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
194 parser->cursor++;
195 break;
196 case '}':
197 parser->cursor++;
198 return true;
199 default:
200 parser->cursor = savepoint;
201 return true;
202 }
203 break;
204 }
205 }
206
207 return true;
208}
209
218static bool
219pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
220 if (pm_regexp_char_is_eof(parser)) return true;
221
222 switch (*parser->cursor) {
223 case '*':
224 case '+':
225 case '?':
226 parser->cursor++;
227 return true;
228 case '{':
229 parser->cursor++;
230 return pm_regexp_parse_range_quantifier(parser);
231 default:
232 // In this case there is no quantifier.
233 return true;
234 }
235}
236
241static bool
242pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
243 if (!pm_regexp_char_expect(parser, ':')) {
244 return false;
245 }
246
247 pm_regexp_char_accept(parser, '^');
248
249 return (
250 pm_regexp_char_find(parser, ':') &&
251 pm_regexp_char_expect(parser, ']') &&
252 pm_regexp_char_expect(parser, ']')
253 );
254}
255
256// Forward declaration because character sets can be nested.
257static bool
258pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
259
264static bool
265pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
266 pm_regexp_char_accept(parser, '^');
267
268 while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
269 switch (*parser->cursor++) {
270 case '[':
271 pm_regexp_parse_lbracket(parser);
272 break;
273 case '\\':
274 if (!pm_regexp_char_is_eof(parser)) {
275 parser->cursor++;
276 }
277 break;
278 default:
279 // do nothing, we've already advanced the cursor
280 break;
281 }
282 }
283
284 return pm_regexp_char_expect(parser, ']');
285}
286
290static bool
291pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
292 const uint8_t *reset = parser->cursor;
293
294 if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
295 parser->cursor++;
296 if (pm_regexp_parse_posix_class(parser)) return true;
297
298 parser->cursor = reset;
299 }
300
301 return pm_regexp_parse_character_set(parser);
302}
303
304// Forward declaration here since parsing groups needs to go back up the grammar
305// to parse expressions within them.
306static bool
307pm_regexp_parse_expression(pm_regexp_parser_t *parser);
308
/**
 * The states that a single regular expression option can be in while the
 * options of a group are being parsed.
 */
typedef enum {
    /** The option is not recognized; it can be neither added nor removed. */
    PM_REGEXP_OPTION_STATE_INVALID = 0,

    /** The option has not been seen yet and may be added or removed. */
    PM_REGEXP_OPTION_STATE_TOGGLEABLE = 1,

    /** The option has not been seen yet and may only be added. */
    PM_REGEXP_OPTION_STATE_ADDABLE = 2,

    /** The option has been added. */
    PM_REGEXP_OPTION_STATE_ADDED = 3,

    /** The option has been removed; it can no longer be added. */
    PM_REGEXP_OPTION_STATE_REMOVED = 4
} pm_regexp_option_state_t;
320
// These are the options that are configurable on the regular expression (or
// from within a group). Each option is a single lowercase letter, and the
// state table below has one slot for every letter from the minimum to the
// maximum, inclusive.

#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)

/**
 * This is the set of options that are configurable on the regular expression.
 */
typedef struct {
    /** The current state of each option. */
    uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
} pm_regexp_options_t;
335
339static void
340pm_regexp_options_init(pm_regexp_options_t *options) {
341 memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
342 options->values['i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
343 options->values['m' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
344 options->values['x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
345 options->values['d' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
346 options->values['a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
347 options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
348}
349
354static bool
355pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
356 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
357 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
358
359 switch (options->values[key]) {
360 case PM_REGEXP_OPTION_STATE_INVALID:
361 case PM_REGEXP_OPTION_STATE_REMOVED:
362 return false;
363 case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
364 case PM_REGEXP_OPTION_STATE_ADDABLE:
365 options->values[key] = PM_REGEXP_OPTION_STATE_ADDED;
366 return true;
367 case PM_REGEXP_OPTION_STATE_ADDED:
368 return true;
369 }
370 }
371
372 return false;
373}
374
379static bool
380pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
381 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
382 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
383
384 switch (options->values[key]) {
385 case PM_REGEXP_OPTION_STATE_INVALID:
386 case PM_REGEXP_OPTION_STATE_ADDABLE:
387 return false;
388 case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
389 case PM_REGEXP_OPTION_STATE_ADDED:
390 case PM_REGEXP_OPTION_STATE_REMOVED:
391 options->values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
392 return true;
393 }
394 }
395
396 return false;
397}
398
420static bool
421pm_regexp_parse_group(pm_regexp_parser_t *parser) {
422 // First, parse any options for the group.
423 if (pm_regexp_char_accept(parser, '?')) {
424 if (pm_regexp_char_is_eof(parser)) {
425 return false;
426 }
427 pm_regexp_options_t options;
428 pm_regexp_options_init(&options);
429
430 switch (*parser->cursor) {
431 case '#': { // inline comments
432 if (parser->encoding_changed && parser->encoding->multibyte) {
433 bool escaped = false;
434
435 // Here we're going to take a slow path and iterate through
436 // each multibyte character to find the close paren. We do
437 // this because \ can be a trailing byte in some encodings.
438 while (parser->cursor < parser->end) {
439 if (!escaped && *parser->cursor == ')') {
440 parser->cursor++;
441 return true;
442 }
443
444 size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
445 if (width == 0) return false;
446
447 escaped = (width == 1) && (*parser->cursor == '\\');
448 parser->cursor += width;
449 }
450
451 return false;
452 } else {
453 // Here we can take the fast path and use memchr to find the
454 // next ) because we are safe checking backward for \ since
455 // it cannot be a trailing character.
456 bool found = pm_regexp_char_find(parser, ')');
457
458 while (found && (parser->start <= parser->cursor - 2) && (*(parser->cursor - 2) == '\\')) {
459 found = pm_regexp_char_find(parser, ')');
460 }
461
462 return found;
463 }
464 }
465 case ':': // non-capturing group
466 case '=': // positive lookahead
467 case '!': // negative lookahead
468 case '>': // atomic group
469 case '~': // absence operator
470 parser->cursor++;
471 break;
472 case '<':
473 parser->cursor++;
474 if (pm_regexp_char_is_eof(parser)) {
475 return false;
476 }
477
478 switch (*parser->cursor) {
479 case '=': // positive lookbehind
480 case '!': // negative lookbehind
481 parser->cursor++;
482 break;
483 default: { // named capture group
484 const uint8_t *start = parser->cursor;
485 if (!pm_regexp_char_find(parser, '>')) {
486 return false;
487 }
488 pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
489 break;
490 }
491 }
492 break;
493 case '\'': { // named capture group
494 const uint8_t *start = ++parser->cursor;
495 if (!pm_regexp_char_find(parser, '\'')) {
496 return false;
497 }
498
499 pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
500 break;
501 }
502 case '(': // conditional expression
503 if (!pm_regexp_char_find(parser, ')')) {
504 return false;
505 }
506 break;
507 case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': // options
508 while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') {
509 if (!pm_regexp_options_add(&options, *parser->cursor)) {
510 return false;
511 }
512 parser->cursor++;
513 }
514
515 if (pm_regexp_char_is_eof(parser)) {
516 return false;
517 }
518
519 // If we hit a -, then we're done parsing options.
520 if (*parser->cursor != '-') break;
521
522 // Otherwise, fallthrough to the - case.
523 /* fallthrough */
524 case '-':
525 parser->cursor++;
526 while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
527 if (!pm_regexp_options_remove(&options, *parser->cursor)) {
528 return false;
529 }
530 parser->cursor++;
531 }
532
533 if (pm_regexp_char_is_eof(parser)) {
534 return false;
535 }
536 break;
537 default:
538 return false;
539 }
540 }
541
542 // Now, parse the expressions within this group.
543 while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
544 if (!pm_regexp_parse_expression(parser)) {
545 return false;
546 }
547 pm_regexp_char_accept(parser, '|');
548 }
549
550 // Finally, make sure we have a closing parenthesis.
551 return pm_regexp_char_expect(parser, ')');
552}
553
566static bool
567pm_regexp_parse_item(pm_regexp_parser_t *parser) {
568 switch (*parser->cursor++) {
569 case '^':
570 case '$':
571 return true;
572 case '\\':
573 if (!pm_regexp_char_is_eof(parser)) {
574 parser->cursor++;
575 }
576 return pm_regexp_parse_quantifier(parser);
577 case '(':
578 return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
579 case '[':
580 return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
581 default:
582 return pm_regexp_parse_quantifier(parser);
583 }
584}
585
590static bool
591pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
592 if (!pm_regexp_parse_item(parser)) {
593 return false;
594 }
595
596 while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
597 if (!pm_regexp_parse_item(parser)) {
598 return false;
599 }
600 }
601
602 return true;
603}
604
611static bool
612pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
613 return (
614 (
615 // Exit early if the pattern is empty.
616 pm_regexp_char_is_eof(parser) ||
617 // Parse the first expression in the pattern.
618 pm_regexp_parse_expression(parser)
619 ) &&
620 (
621 // Return now if we've parsed the entire pattern.
622 pm_regexp_char_is_eof(parser) ||
623 // Otherwise, we should have a pipe character.
624 (pm_regexp_char_expect(parser, '|') && pm_regexp_parse_pattern(parser))
625 )
626 );
627}
628
634pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) {
635 pm_regexp_parser_t parser;
636 pm_regexp_parser_init(&parser, source, source + size, named_captures, encoding_changed, encoding);
637 return pm_regexp_parse_pattern(&parser);
638}
#define PRISM_EXPORTED_FUNCTION
By default, we compile with -fvisibility=hidden.
Definition defines.h:32
A regular expression parser.
This struct defines the functions necessary to implement the encoding interface so we can determine h...
Definition encoding.h:23
size_t(* char_width)(const uint8_t *b, ptrdiff_t n)
Return the number of bytes that the next character takes if it is valid in the encoding.
Definition encoding.h:29
bool multibyte
Return true if the encoding is a multibyte encoding.
Definition encoding.h:61
This is the set of options that are configurable on the regular expression.
Definition regexp.c:331
uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS]
The current state of each option.
Definition regexp.c:333
This is the parser that is going to handle parsing regular expressions.
Definition regexp.c:6
const uint8_t * cursor
A pointer to the current position in the source.
Definition regexp.c:11
const uint8_t * start
A pointer to the start of the source that we are parsing.
Definition regexp.c:8
const uint8_t * end
A pointer to the end of the source that we are parsing.
Definition regexp.c:14
const pm_encoding_t * encoding
The encoding of the source.
Definition regexp.c:23
pm_string_list_t * named_captures
A list of named captures that we've found.
Definition regexp.c:17
bool encoding_changed
Whether the encoding has changed from the default.
Definition regexp.c:20
A list of strings.
A generic string type that can have various ownership semantics.
Definition pm_string.h:30