class Regexp::Lexer
A very thin wrapper around the scanner that breaks quantified literal runs, collects emitted tokens into an array, calculates their nesting depth, normalizes tokens for the parser, and checks whether they are implemented by the given syntax flavor.
Constants
- CLOSING_TOKENS
- CONDITION_TOKENS
- OPENING_TOKENS
Attributes
Public Class Methods
Source
# File lib/regexp_parser/lexer.rb, line 18 def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block) new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block) end
Also aliased as: scan
Public Instance Methods
Source
# File lib/regexp_parser/lexer.rb, line 73
# Hands a finished token to the caller-supplied block (when given) and/or
# appends it to the collected +tokens+ array, honoring +collect_tokens+.
def emit(token)
  # Without a callback block, simply accumulate the token.
  return tokens << token unless block

  # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
  result = block.call(token)
  tokens << result if collect_tokens
end
Source
# File lib/regexp_parser/lexer.rb, line 22 def lex(input, syntax = nil, options: nil, collect_tokens: true, &block) syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT self.block = block self.collect_tokens = collect_tokens self.tokens = [] self.prev_token = nil self.preprev_token = nil self.nesting = 0 self.set_nesting = 0 self.conditional_nesting = 0 self.shift = 0 Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te| type, token = *syntax.normalize(type, token) syntax.check! type, token ascend(type, token) if (last = prev_token) && type == :quantifier && ( (last.type == :literal && (parts = break_literal(last))) || (last.token == :codepoint_list && (parts = break_codepoint_list(last))) ) emit(parts[0]) last = parts[1] end current = Regexp::Token.new(type, token, text, ts + shift, te + shift, nesting, set_nesting, conditional_nesting) if type == :conditional && CONDITION_TOKENS.include?(token) current = merge_condition(current, last) elsif last last.next = current current.previous = last emit(last) end self.preprev_token = last self.prev_token = current descend(type, token) end emit(prev_token) if prev_token collect_tokens ? tokens : nil end
Private Instance Methods
Source
# File lib/regexp_parser/lexer.rb, line 93
# Decrements the nesting counter that corresponds to +type+ when +token+
# closes a construct; non-closing tokens are ignored.
def ascend(type, token)
  return unless CLOSING_TOKENS.include?(token)

  case type
  when :set
    self.set_nesting -= 1
  when :conditional
    self.conditional_nesting -= 1
  when :group, :assertion
    self.nesting -= 1
  else
    raise "unhandled nesting type #{type}"
  end
end
Source
# File lib/regexp_parser/lexer.rb, line 145 def break_codepoint_list(token) lead, _, tail = token.text.rpartition(' ') return if lead.empty? token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}', token.ts, (token.te - tail.length), nesting, set_nesting, conditional_nesting) token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail, (token.ts + lead.length + 1), (token.te + 3), nesting, set_nesting, conditional_nesting) self.shift = shift + 3 # one space less, but extra \, u, {, and } token_1.previous = preprev_token token_1.next = token_2 token_2.previous = token_1 # .next will be set by #lex [token_1, token_2] end
if a codepoint list is followed by a quantifier, that quantifier applies only to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'. Compare break_literal.
Source
# File lib/regexp_parser/lexer.rb, line 125
# Splits a multi-character literal run into two tokens so that a following
# quantifier applies only to the final character. Returns nil for a
# single-character literal (nothing to break). Returns [leading, trailing];
# the leading token is linked to preprev_token, trailing.next is set by #lex.
def break_literal(token)
  head, tail, _ = token.text.partition(/.\z/mu)
  return if head.empty?

  head_token = Regexp::Token.new(
    :literal, :literal, head,
    token.ts, (token.te - tail.length),
    nesting, set_nesting, conditional_nesting
  )
  tail_token = Regexp::Token.new(
    :literal, :literal, tail,
    (token.ts + head.length), token.te,
    nesting, set_nesting, conditional_nesting
  )

  head_token.previous = preprev_token
  head_token.next = tail_token
  tail_token.previous = head_token # .next will be set by #lex
  [head_token, tail_token]
end
called by #lex (aliased as #scan) to break a literal run that is longer than one character into two separate tokens when the run is followed by a quantifier
Source
# File lib/regexp_parser/lexer.rb, line 108
# Increments the nesting counter that corresponds to +type+ when +token+
# opens a construct; non-opening tokens are ignored.
def descend(type, token)
  return unless OPENING_TOKENS.include?(token)

  case type
  when :set
    self.set_nesting += 1
  when :conditional
    self.conditional_nesting += 1
  when :group, :assertion
    self.nesting += 1
  else
    raise "unhandled nesting type #{type}"
  end
end
Source
# File lib/regexp_parser/lexer.rb, line 164
# Folds a conditional condition token into the token that precedes it,
# producing a single :condition token spanning both source ranges.
def merge_condition(current, last)
  merged = Regexp::Token.new(
    :conditional, :condition,
    last.text + current.text,
    last.ts, current.te,
    nesting, set_nesting, conditional_nesting
  )
  merged.previous = preprev_token # .next will be set by #lex
  merged
end