35 .named_captures = named_captures,
36 .encoding_changed = encoding_changed,
45pm_regexp_parser_named_capture(
pm_regexp_parser_t *parser,
const uint8_t *start,
const uint8_t *end) {
47 pm_string_shared_init(&
string, start, end);
49 pm_string_free(&
string);
65 if (!pm_regexp_char_is_eof(parser) && *parser->
cursor == value) {
77 if (!pm_regexp_char_is_eof(parser) && *parser->
cursor == value) {
89 if (pm_regexp_char_is_eof(parser)) {
137 const uint8_t *savepoint = parser->
cursor;
140 PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
141 PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
142 PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
143 PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
144 } state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
148 case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
149 switch (*parser->
cursor) {
150 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
152 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
156 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
159 parser->
cursor = savepoint;
163 case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
164 switch (*parser->
cursor) {
165 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
170 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
176 parser->
cursor = savepoint;
180 case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
181 switch (*parser->
cursor) {
182 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
184 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
187 parser->
cursor = savepoint;
191 case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
192 switch (*parser->
cursor) {
193 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
200 parser->
cursor = savepoint;
220 if (pm_regexp_char_is_eof(parser))
return true;
222 switch (*parser->
cursor) {
230 return pm_regexp_parse_range_quantifier(parser);
243 if (!pm_regexp_char_expect(parser,
':')) {
247 pm_regexp_char_accept(parser,
'^');
250 pm_regexp_char_find(parser,
':') &&
251 pm_regexp_char_expect(parser,
']') &&
252 pm_regexp_char_expect(parser,
']')
266 pm_regexp_char_accept(parser,
'^');
268 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
']') {
269 switch (*parser->
cursor++) {
271 pm_regexp_parse_lbracket(parser);
274 if (!pm_regexp_char_is_eof(parser)) {
284 return pm_regexp_char_expect(parser,
']');
292 const uint8_t *reset = parser->
cursor;
296 if (pm_regexp_parse_posix_class(parser))
return true;
301 return pm_regexp_parse_character_set(parser);
314 PM_REGEXP_OPTION_STATE_INVALID,
315 PM_REGEXP_OPTION_STATE_TOGGLEABLE,
316 PM_REGEXP_OPTION_STATE_ADDABLE,
317 PM_REGEXP_OPTION_STATE_ADDED,
318 PM_REGEXP_OPTION_STATE_REMOVED
319} pm_regexp_option_state_t;
/*
 * Bounds of the option-character table. An option character is accepted only
 * when it lies in the inclusive range [SLOT_MINIMUM, SLOT_MAXIMUM]; it is then
 * turned into an index into the options' values array by subtracting
 * SLOT_MINIMUM. SLOTS is the number of characters in that inclusive range and
 * is used as the length of the values array.
 */
324#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
325#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
326#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
333 uint8_t
values[PRISM_REGEXP_OPTION_STATE_SLOTS];
341 memset(options, PM_REGEXP_OPTION_STATE_INVALID,
sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
342 options->
values[
'i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
343 options->
values[
'm' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
344 options->
values[
'x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
345 options->
values[
'd' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
346 options->
values[
'a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
347 options->
values[
'u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
356 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
357 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
359 switch (options->
values[key]) {
360 case PM_REGEXP_OPTION_STATE_INVALID:
361 case PM_REGEXP_OPTION_STATE_REMOVED:
363 case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
364 case PM_REGEXP_OPTION_STATE_ADDABLE:
365 options->
values[key] = PM_REGEXP_OPTION_STATE_ADDED;
367 case PM_REGEXP_OPTION_STATE_ADDED:
381 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
382 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
384 switch (options->
values[key]) {
385 case PM_REGEXP_OPTION_STATE_INVALID:
386 case PM_REGEXP_OPTION_STATE_ADDABLE:
388 case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
389 case PM_REGEXP_OPTION_STATE_ADDED:
390 case PM_REGEXP_OPTION_STATE_REMOVED:
391 options->
values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
423 if (pm_regexp_char_accept(parser,
'?')) {
424 if (pm_regexp_char_is_eof(parser)) {
428 pm_regexp_options_init(&options);
430 switch (*parser->
cursor) {
433 bool escaped =
false;
439 if (!escaped && *parser->
cursor ==
')') {
445 if (width == 0)
return false;
447 escaped = (width == 1) && (*parser->
cursor ==
'\\');
456 bool found = pm_regexp_char_find(parser,
')');
458 while (found && (parser->
start <= parser->
cursor - 2) && (*(parser->
cursor - 2) ==
'\\')) {
459 found = pm_regexp_char_find(parser,
')');
474 if (pm_regexp_char_is_eof(parser)) {
478 switch (*parser->
cursor) {
484 const uint8_t *start = parser->
cursor;
485 if (!pm_regexp_char_find(parser,
'>')) {
488 pm_regexp_parser_named_capture(parser, start, parser->
cursor - 1);
494 const uint8_t *start = ++parser->
cursor;
495 if (!pm_regexp_char_find(parser,
'\'')) {
499 pm_regexp_parser_named_capture(parser, start, parser->
cursor - 1);
503 if (!pm_regexp_char_find(parser,
')')) {
507 case 'i':
case 'm':
case 'x':
case 'd':
case 'a':
case 'u':
508 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
'-' && *parser->
cursor !=
':' && *parser->
cursor !=
')') {
509 if (!pm_regexp_options_add(&options, *parser->
cursor)) {
515 if (pm_regexp_char_is_eof(parser)) {
520 if (*parser->
cursor !=
'-')
break;
526 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
':' && *parser->
cursor !=
')') {
527 if (!pm_regexp_options_remove(&options, *parser->
cursor)) {
533 if (pm_regexp_char_is_eof(parser)) {
543 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
')') {
544 if (!pm_regexp_parse_expression(parser)) {
547 pm_regexp_char_accept(parser,
'|');
551 return pm_regexp_char_expect(parser,
')');
568 switch (*parser->
cursor++) {
573 if (!pm_regexp_char_is_eof(parser)) {
576 return pm_regexp_parse_quantifier(parser);
578 return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
580 return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
582 return pm_regexp_parse_quantifier(parser);
592 if (!pm_regexp_parse_item(parser)) {
596 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
')' && *parser->
cursor !=
'|') {
597 if (!pm_regexp_parse_item(parser)) {
616 pm_regexp_char_is_eof(parser) ||
618 pm_regexp_parse_expression(parser)
622 pm_regexp_char_is_eof(parser) ||
624 (pm_regexp_char_expect(parser,
'|') && pm_regexp_parse_pattern(parser))
634pm_regexp_named_capture_group_names(
const uint8_t *source,
size_t size,
pm_string_list_t *named_captures,
bool encoding_changed,
const pm_encoding_t *encoding) {
636 pm_regexp_parser_init(&parser, source, source + size, named_captures, encoding_changed, encoding);
637 return pm_regexp_parse_pattern(&parser);
#define PRISM_EXPORTED_FUNCTION
By default, we compile with -fvisibility=hidden.
A regular expression parser.
This struct defines the functions necessary to implement the encoding interface so we can determine how many bytes the next character takes.
size_t(* char_width)(const uint8_t *b, ptrdiff_t n)
Return the number of bytes that the next character takes if it is valid in the encoding.
bool multibyte
Return true if the encoding is a multibyte encoding.
This is the set of options that are configurable on the regular expression.
uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS]
The current state of each option.
This is the parser that is going to handle parsing regular expressions.
const uint8_t * cursor
A pointer to the current position in the source.
const uint8_t * start
A pointer to the start of the source that we are parsing.
const uint8_t * end
A pointer to the end of the source that we are parsing.
const pm_encoding_t * encoding
The encoding of the source.
pm_string_list_t * named_captures
A list of named captures that we've found.
bool encoding_changed
Whether the encoding has changed from the default.
A generic string type that can have various ownership semantics.