use QRegex; use NQPHLL; use QAST; class QRegex::P6Regex::World is HLL::World { method create_code($ast, $name) { # Create a fresh stub code, and set its name. my $dummy := nqp::freshcoderef(-> { nqp::die("Uncompiled code executed") }); nqp::setcodename($dummy, $name); # Tag it as a static code ref and add it to the root code refs set. nqp::markcodestatic($dummy); self.add_root_code_ref($dummy, $ast); # Create code object. my $code_obj := nqp::create(NQPRegex); nqp::bindattr($code_obj, NQPRegex, '$!do', $dummy); my $slot := self.add_object($code_obj); # Add fixup of the code object and the $!do attribute. my $fixups := QAST::Stmt.new(); $fixups.push(QAST::Op.new( :op('bindattr'), QAST::WVal.new( :value($code_obj) ), QAST::WVal.new( :value(NQPRegex) ), QAST::SVal.new( :value('$!do') ), QAST::BVal.new( :value($ast) ) )); $fixups.push(QAST::Op.new( :op('setcodeobj'), QAST::BVal.new( :value($ast) ), QAST::WVal.new( :value($code_obj) ) )); self.add_fixup_task(:fixup_ast($fixups)); $code_obj } } grammar QRegex::P6Regex::Grammar is HLL::Grammar { method obs ($old, $new, $when = ' in Perl 6') { self.panic('Unsupported use of ' ~ ~$old ~ ';' ~ ~$when ~ ' please use ' ~ ~$new); } # errors are reported through methods, so that subclasses like Rakudo's # Perl6::RegexGrammar can override them, and throw language-specific # exceptions method throw_unrecognized_metachar ($char) { self.panic('Unrecognized regex metacharacter ' ~ $char ~ ' (must be quoted to match literally)'); } method throw_malformed_range() { self.panic('Malformed range.'); } method throw_confused() { self.panic('Confused.'); } method throw_unspace($char) { self.panic: "No unspace allowed in regex; " ~ " if you meant to match the literal character," ~ " please enclose in single quotes ('" ~ $char ~ "') or use a backslashed form like \\x" ~ nqp::sprintf('%02x', [nqp::ord($char)]); } method throw_regex_not_terminated() { self.panic('Regex not terminated.'); } method throw_spaces_in_bare_range() { self.panic('Spaces not allowed in bare range.'); } method throw_unecessary_upto_inf() { self.panic('Unecessary use of "** ^*" quantifier. Did you mean to use the "*" quantifier'); } method throw_solitary_quantifier() { self.panic('Quantifier quantifies nothing.'); } method throw_non_quantifiable() { self.panic('Can only quantify a construct that produces a match'); } method throw_solitary_backtrack_control() { self.panic("Backtrack control ':' does not seem to have a preceding atom to control"); } method throw_null_pattern() { self.panic('Null regex not allowed'); } method worry(*@args) { note(nqp::join('', @args) ~ "\n"); } token ws { [ \s | '#' \N* ]* } token normspace { <.ws> } token identifier { <.ident> [ <[\-']> <.ident> ]* } token arg { [ | | | $=[\d+] ] } rule arglist { '' +% [',' ] } my $cur_handle := 0; token TOP { :my %*RX; :my $handle := '__QREGEX_P6REGEX__' ~ $cur_handle++; :my $*W := QRegex::P6Regex::World.new(:$handle); [ $ || <.throw_confused> ] } token nibbler { :my $OLDRX := nqp::getlexdyn('%*RX'); :my %*RX; :my $*SEQ := 0; { for $OLDRX { %*RX{$_.key} := $_.value; } } <.ws> [ || || $$ <.throw_regex_not_terminated> || (\W) { self.throw_unrecognized_metachar: ~$/[0] } || <.throw_regex_not_terminated> ] } regex infixstopper { :dba('infix stopper') [ | > | ' <.-[>]> > | ] } token rxstopper { $ } # XXX Eventually squish termseq and termish and # get < || && | & > infixes using by EXPR in nibbler token termseq { } token termaltseq { [ '||' { $*SEQ := 1; } <.ws> ]? [ '||' <.ws> { $*SEQ := 1; } ]* } token termconjseq { [ '&&' { $*SEQ := 0; } <.ws> ]? [ '&&' <.ws> { $*SEQ := 0; } ]* } token termalt { [ '|' <.ws> ]? [ '|' <.ws> { $*SEQ := 0; } ]* } token termconj { [ '&' <.ws> ]? [ '&' <.ws> { $*SEQ := 0; } ]* } token termish { :my $*SIGOK := 0; :my $*VARDEF := 0; [ || + || | <.[&|~]> > <.throw_null_pattern> || > <.throw_null_pattern> # XXX Check if unmatched bracket || $$ <.throw_regex_not_terminated> || (\W) { self.throw_unrecognized_metachar: ~$/[0] } || <.throw_regex_not_terminated> ] } method SIGOK() { $*SIGOK := %*RX; self } token quantified_atom { [ || ? [ | | ] [ <.SIGOK> ]? [ <.ws> ]? || [ ]? ] { $*SIGOK := 0 } } rule separator { $=['%''%'?] :my $*VARDEF := 0; :my $*SIGOK := 0; } token atom { # :dba('regex atom') [ | \w [ > > || $*HAS_GOAL }> <.worry("Space is not significant here; please use quotes or :s (:sigspace) modifier (or, to suppress this warning, omit the space, or otherwise change the spacing)")> ]? <.SIGOK> | ] } proto token sigmaybe { <...> } token sigmaybe:sym { } token sigmaybe:sym { } proto token quantifier { <...> } token quantifier:sym<%> { ('%''%'?) { $/.panic("Missing quantifier on the left argument of " ~ $/[0]); } } token quantifier:sym<*> { } token quantifier:sym<+> { } token quantifier:sym { } token quantifier:sym<{N,M}> { {} '{' (\d+) (','?) (\d*) '}' <.obs: '{N,M} as general quantifier', '** N..M (or ** N..*)'> } token quantifier:sym<**> { # 10 | 1..10 | 1^..10 | 1^..^10 | 1..^10 | ^10 | 1..* | 1^..* <.normspace>? <.normspace>? [ | \s+ '..' <.throw_spaces_in_bare_range> | '^' '*' <.throw_unecessary_upto_inf> | $='^' | [ | [ [ | $='^' [ '..' | <.throw_malformed_range> ] | '..' ] [ | $='^'? { $/.panic("Negative numbers are not allowed as quantifiers") if nqp::radix(10, $, 0, 0)[0] < 0; } | $=['*'] | <.throw_malformed_range> ] ]? ] { $/.panic("Negative numbers are not allowed as quantifiers") if nqp::radix(10, $, 0, 0)[0] < 0 } | ] } token codeblock { } token backmod { ':'? [ '?' | '!' | ] } proto token metachar { <...> } token metachar:sym<[ ]> { '[' ~ ']' <.SIGOK> } token metachar:sym<( )> { '(' ~ ')' <.SIGOK> } token metachar:sym<'> { <.SIGOK> } token metachar:sym<"> { <.SIGOK> } token metachar:sym<.> { <.SIGOK> } token metachar:sym<^> { <.SIGOK> } token metachar:sym<^^> { <.SIGOK> } token metachar:sym<$> { <.SIGOK> } token metachar:sym<$$> { <.SIGOK> } token metachar:sym<:::> { <.panic: '::: not yet implemented'> } token metachar:sym<::> { <.panic: ':: not yet implemented'> } token metachar:sym { $=['<<'|'«'] <.SIGOK> } token metachar:sym { $=['>>'|'»'] <.SIGOK> } token metachar:sym { '<(' <.SIGOK> } token metachar:sym { ')>' <.SIGOK> } token metachar:sym { \\ <.SIGOK> } token metachar:sym { } token metachar:sym { <.throw_solitary_quantifier> } ## we cheat here, really should be regex_infix:sym<~> token metachar:sym<~> { :my $*HAS_GOAL := 1; <.ws> <.ws> } token metachar:sym<{*}> { [ \h* '#= ' \h* $=[\S+ [\h+ \S+]*] ]**0..1 } token metachar:sym { '<' ~ '>' <.SIGOK> } token sigil { <[$@%&]> } token metachar:sym { [ | $=['@'] '<' $=[<-[>]>+] '>' | '$<' $=[<-[>]>+] '>' | '$' $=[\d+] ] [ <.ws> '=' <.ws> { $*VARDEF := 1 } { $*VARDEF := 0 } ]**0..1 <.SIGOK> } token metachar:sym<:> { <.throw_solitary_backtrack_control> } proto token backslash { <...> } token backslash:sym { $=[<[dDnNsSwW]>] } token backslash:sym { $=[<[eE]>] } token backslash:sym { $=[<[fF]>] } token backslash:sym { $=[<[hH]>] } token backslash:sym { $=[<[rR]>] } token backslash:sym { $=[<[tT]>] } token backslash:sym { $=[<[vV]>] } token backslash:sym { $=[<[oO]>] [ | '[' ']' ] } token backslash:sym { $=[<[xX]>] [ | '[' ']' ] } token backslash:sym { $=[<[cC]>] } token backslash:sym<0> { $=['0'] } token backslash:sym { 'B' <.obs: '\\B', ' for negated word boundary. If you meant a negated' ~ ' backspace character, use it in a negated character class (<-[\b]>).' >} token backslash:sym { 'b' <.obs: '\\b', '<|w> for word boundary (or « and » for left/right boundaries).' ~ ' If you meant the backspace character, quote it ("\b") or use it as' ~ ' inside a character class (<[\b]>)' >} token backslash:sym { 'K' <.obs: '\\K', '<( for discarding text before the capture marker or )> for discarding text after.' >} token backslash:sym { 'A' <.obs: '\\A as beginning-of-string matcher', '^'> } token backslash:sym { 'z' <.obs: '\\z as end-of-string matcher', '$'> } token backslash:sym { 'Z' <.obs: '\\Z as end-of-string matcher', '\\n?$'> } token backslash:sym { 'Q' <.obs: '\\Q as quotemeta', 'quotes or literal variable match'> } token backslash:sym { {} (\w) { self.throw_unrecog_backslash_seq: $/[0].Str } } token backslash:sym { \s {} <.throw_unspace(~$/)> } token backslash:sym { \W } proto token cclass_backslash { <...> } token cclass_backslash:sym { $=[<[dDnNsSwW]>] } token cclass_backslash:sym { $=[<[bB]>] } token cclass_backslash:sym { $=[<[eE]>] } token cclass_backslash:sym { $=[<[fF]>] } token cclass_backslash:sym { $=[<[hH]>] } token cclass_backslash:sym { $=[<[nN]>] } token cclass_backslash:sym { $=[<[rR]>] } token cclass_backslash:sym { $=[<[tT]>] } token cclass_backslash:sym { $=[<[vV]>] } token cclass_backslash:sym { $=[<[oO]>] [ | '[' ']' ] } token cclass_backslash:sym { $=[<[xX]>] [ | '[' ']' ] } token cclass_backslash:sym { $=[<[cC]>] } token cclass_backslash:sym<0> { $=['0'] } token cclass_backslash:sym { . } proto token assertion { <...> } token assertion:sym { '?' [ ' > | ] } token assertion:sym { '!' [ ' > | ] } token assertion:sym<|> { '|' } token assertion:sym { '.' } token assertion:sym { [ | '> | '=' | ':' | '(' ')' | <.normspace> ]? } token assertion:sym<[> { + } token cclass_elem { :my $*key; $=['+'|'-'|] <.normspace>? [ | '[' $=( || [ \s* '-' ] <.obs: '- as character range','.. for range, for explicit - in character class, escape it or place it as the first or last thing'> || \s* ( '\\' || (<-[\]\\]>) ) [ \s* '..' \s* ( '\\' || (<-[\]\\]>) ) ]**0..1 )* { my %seen; for $ { %seen{$_[0][0]} := (%seen{$_[0][0]} // 0) + 1 if nqp::defined($_[0][0]); %seen{$_[1][0]} := (%seen{$_[1][0]} // 0) + 1 if nqp::defined($_[1][0]); } for %seen { next if $_.value < 2; self.worry("Quotes are not metacharacters in character classes") if $_.key eq '"' || $_.key eq "'"; self.worry("Repeated character (" ~ $_.key ~ ") unexpectedly found in character class"); } } \s* ']' | $= | ':' $=['!'|] { $*key := $.Str } [ || || ] ] <.normspace>? } token mod_internal { ':' [ | $=('!')**1 » | $=(\d+)**1 » | [ '(' [ | $=[\d+] | | ] ')' ]**0..1 ] { if !$ { my $n := $[0] gt '' ?? ($[0] eq '!' ?? 0 !! +$[0]) !! 1; %*RX{ ~$ } := $n; } } } proto token mod_ident { <...> } token mod_ident:sym { $=[i] 'gnorecase'? » } token mod_ident:sym { [ | $=[m] | 'ignore' $=[m] 'ark' ] » } token mod_ident:sym { $=[r] 'atchet'? » } token mod_ident:sym { $=[s] 'igspace'? » } token mod_ident:sym { » } token mod_ident:sym { {} (\w+) { self.throw_unrecognized_regex_modifier($/[0].Str) } } method throw_unrecognized_regex_modifier($mod) { self.panic('Unrecognized regex modifier :' ~ $mod); } }