# encoding: utf-8 module CodeRay module Scanners # This scanner is really complex, since Ruby _is_ a complex language! # # It tries to highlight 100% of all common code, # and 90% of strange codes. # # It is optimized for HTML highlighting, and is not very useful for # parsing or pretty printing. # # For now, I think it's better than the scanners in VIM or Syntax, or # any highlighter I was able to find, except Caleb's RubyLexer. # # I hope it's also better than the rdoc/irb lexer. class Ruby < Scanner include Streamable register_for :ruby file_extension 'rb' helper :patterns if not defined? EncodingError EncodingError = Class.new Exception end private def scan_tokens tokens, options if string.respond_to?(:encoding) unless string.encoding == Encoding::UTF_8 self.string = string.encode Encoding::UTF_8, :invalid => :replace, :undef => :replace, :replace => '?' end unicode = false else unicode = exist?(/[^\x00-\x7f]/) end last_token_dot = false value_expected = true heredocs = nil last_state = nil state = :initial depth = nil inline_block_stack = [] patterns = Patterns # avoid constant lookup until eos? match = nil kind = nil if state.instance_of? patterns::StringState # {{{ match = scan_until(state.pattern) || scan_until(/\z/) tokens << [match, :content] unless match.empty? break if eos? if state.heredoc and self[1] # end of heredoc match = getch.to_s match << scan_until(/$/) unless eos? tokens << [match, :delimiter] tokens << [:close, state.type] state = state.next_state next end case match = getch when state.delim if state.paren state.paren_depth -= 1 if state.paren_depth > 0 tokens << [match, :nesting_delimiter] next end end tokens << [match, :delimiter] if state.type == :regexp and not eos? modifiers = scan(/#{patterns::REGEXP_MODIFIERS}/ox) tokens << [modifiers, :modifier] unless modifiers.empty? end tokens << [:close, state.type] value_expected = false state = state.next_state when '\\' if state.interpreted if esc = scan(/ #{patterns::ESCAPE} /ox) tokens << [match + esc, :char] else tokens << [match, :error] end else case m = getch when state.delim, '\\' tokens << [match + m, :char] when nil tokens << [match, :error] else tokens << [match + m, :content] end end when '#' case peek(1) when '{' inline_block_stack << [state, depth, heredocs] value_expected = true state = :initial depth = 1 tokens << [:open, :inline] tokens << [match + getch, :inline_delimiter] when '$', '@' tokens << [match, :escape] last_state = state # scan one token as normal code, then return here state = :initial else raise_inspect 'else-case # reached; #%p not handled' % peek(1), tokens end when state.paren state.paren_depth += 1 tokens << [match, :nesting_delimiter] when /#{patterns::REGEXP_SYMBOLS}/ox tokens << [match, :function] else raise_inspect 'else-case " reached; %p not handled, state = %p' % [match, state], tokens end next # }}} else # {{{ if match = scan(/[ \t\f]+/) kind = :space match << scan(/\s*/) unless eos? || heredocs value_expected = true if match.index(?\n) tokens << [match, kind] next elsif match = scan(/\\?\n/) kind = :space if match == "\n" value_expected = true state = :initial if state == :undef_comma_expected end if heredocs unscan # heredoc scanning needs \n at start state = heredocs.shift tokens << [:open, state.type] heredocs = nil if heredocs.empty? next else match << scan(/\s*/) unless eos? end tokens << [match, kind] next elsif bol? && match = scan(/\#!.*/) tokens << [match, :doctype] next elsif match = scan(/\#.*/) or ( bol? and match = scan(/#{patterns::RUBYDOC_OR_DATA}/o) ) kind = :comment tokens << [match, kind] next elsif state == :initial # IDENTS # if match = scan(unicode ? /#{patterns::METHOD_NAME}/uo : /#{patterns::METHOD_NAME}/o) if last_token_dot kind = if match[/^[A-Z]/] and not match?(/\(/) then :constant else :ident end else if value_expected != :expect_colon && scan(/:(?= )/) tokens << [match, :key] match = ':' kind = :operator else kind = patterns::IDENT_KIND[match] if kind == :ident if match[/\A[A-Z]/] and not match[/[!?]$/] and not match?(/\(/) kind = :constant end elsif kind == :reserved state = patterns::DEF_NEW_STATE[match] value_expected = :set if patterns::KEYWORDS_EXPECTING_VALUE[match] end end end value_expected = :set if check(/#{patterns::VALUE_FOLLOWS}/o) elsif last_token_dot and match = scan(/#{patterns::METHOD_NAME_OPERATOR}|\(/o) kind = :ident value_expected = :set if check(unicode ? /#{patterns::VALUE_FOLLOWS}/uo : /#{patterns::VALUE_FOLLOWS}/o) # OPERATORS # elsif not last_token_dot and match = scan(/ \.\.\.? | (?:\.|::)() | [,\(\)\[\]\{\}] | ==?=? /x) if match !~ / [.\)\]\}] /x or match =~ /\.\.\.?/ value_expected = :set end last_token_dot = :set if self[1] kind = :operator unless inline_block_stack.empty? case match when '{' depth += 1 when '}' depth -= 1 if depth == 0 # closing brace of inline block reached state, depth, heredocs = inline_block_stack.pop heredocs = nil if heredocs && heredocs.empty? tokens << [match, :inline_delimiter] kind = :inline match = :close end end end elsif match = scan(/ ['"] /mx) tokens << [:open, :string] kind = :delimiter state = patterns::StringState.new :string, match == '"', match # important for streaming elsif match = scan(unicode ? /#{patterns::INSTANCE_VARIABLE}/uo : /#{patterns::INSTANCE_VARIABLE}/o) kind = :instance_variable elsif value_expected and match = scan(/\//) tokens << [:open, :regexp] kind = :delimiter interpreted = true state = patterns::StringState.new :regexp, interpreted, match # elsif match = scan(/[-+]?#{patterns::NUMERIC}/o) elsif match = value_expected ? scan(/[-+]?#{patterns::NUMERIC}/o) : scan(/#{patterns::NUMERIC}/o) kind = self[1] ? :float : :integer elsif match = scan(unicode ? /#{patterns::SYMBOL}/uo : /#{patterns::SYMBOL}/o) case delim = match[1] when ?', ?" tokens << [:open, :symbol] tokens << [':', :symbol] match = delim.chr kind = :delimiter state = patterns::StringState.new :symbol, delim == ?", match else kind = :symbol end elsif match = scan(/ -[>=]? | [+!~^]=? | [*|&]{1,2}=? | >>? /x) value_expected = :set kind = :operator elsif value_expected and match = scan(unicode ? /#{patterns::HEREDOC_OPEN}/uo : /#{patterns::HEREDOC_OPEN}/o) indented = self[1] == '-' quote = self[3] delim = self[quote ? 4 : 2] kind = patterns::QUOTE_TO_TYPE[quote] tokens << [:open, kind] tokens << [match, :delimiter] match = :close heredoc = patterns::StringState.new kind, quote != '\'', delim, (indented ? :indented : :linestart ) heredocs ||= [] # create heredocs if empty heredocs << heredoc elsif value_expected and match = scan(/#{patterns::FANCY_START_CORRECT}/o) kind, interpreted = *patterns::FancyStringType.fetch(self[1]) do raise_inspect 'Unknown fancy string: %%%p' % k, tokens end tokens << [:open, kind] state = patterns::StringState.new kind, interpreted, self[2] kind = :delimiter elsif value_expected and match = scan(unicode ? /#{patterns::CHARACTER}/uo : /#{patterns::CHARACTER}/o) kind = :integer elsif match = scan(/ [\/%]=? | <(?:<|=>?)? | [?:;] /x) value_expected = :set kind = :operator elsif match = scan(/`/) if last_token_dot kind = :operator else tokens << [:open, :shell] kind = :delimiter state = patterns::StringState.new :shell, true, match end elsif match = scan(unicode ? /#{patterns::GLOBAL_VARIABLE}/uo : /#{patterns::GLOBAL_VARIABLE}/o) kind = :global_variable elsif match = scan(unicode ? /#{patterns::CLASS_VARIABLE}/uo : /#{patterns::CLASS_VARIABLE}/o) kind = :class_variable else if !unicode && !string.respond_to?(:encoding) # check for unicode debug, $DEBUG = $DEBUG, false begin if check(/./mu).size > 1 # seems like we should try again with unicode unicode = true end rescue # bad unicode char; use getch ensure $DEBUG = debug end next if unicode end kind = :error match = scan(unicode ? /./mu : /./m) end elsif state == :def_expected state = :initial if scan(/self\./) tokens << ['self', :pre_constant] tokens << ['.', :operator] end if match = scan(unicode ? /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/uo : /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/o) kind = :method else next end elsif state == :module_expected if match = scan(/< 1 state = this_block.first tokens << [:close, state.type] end tokens end end end end # vim:fdm=marker