module CodeRay module Scanners # Bases on pygments' PythonLexer, see # http://dev.pocoo.org/projects/pygments/browser/pygments/lexers/agile.py. class Python < Scanner include Streamable register_for :python file_extension 'py' KEYWORDS = [ 'and', 'as', 'assert', 'break', 'class', 'continue', 'def', 'del', 'elif', 'else', 'except', 'finally', 'for', 'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'not', 'or', 'pass', 'raise', 'return', 'try', 'while', 'with', 'yield', 'nonlocal', # new in Python 3 ] OLD_KEYWORDS = [ 'exec', 'print', # gone in Python 3 ] PREDEFINED_METHODS_AND_TYPES = %w[ __import__ abs all any apply basestring bin bool buffer bytearray bytes callable chr classmethod cmp coerce compile complex delattr dict dir divmod enumerate eval execfile exit file filter float frozenset getattr globals hasattr hash hex id input int intern isinstance issubclass iter len list locals long map max min next object oct open ord pow property range raw_input reduce reload repr reversed round set setattr slice sorted staticmethod str sum super tuple type unichr unicode vars xrange zip ] PREDEFINED_EXCEPTIONS = %w[ ArithmeticError AssertionError AttributeError BaseException DeprecationWarning EOFError EnvironmentError Exception FloatingPointError FutureWarning GeneratorExit IOError ImportError ImportWarning IndentationError IndexError KeyError KeyboardInterrupt LookupError MemoryError NameError NotImplemented NotImplementedError OSError OverflowError OverflowWarning PendingDeprecationWarning ReferenceError RuntimeError RuntimeWarning StandardError StopIteration SyntaxError SyntaxWarning SystemError SystemExit TabError TypeError UnboundLocalError UnicodeDecodeError UnicodeEncodeError UnicodeError UnicodeTranslateError UnicodeWarning UserWarning ValueError Warning ZeroDivisionError ] PREDEFINED_VARIABLES_AND_CONSTANTS = [ 'False', 'True', 'None', # "keywords" since Python 3 'self', 'Ellipsis', 'NotImplemented', ] IDENT_KIND = WordList.new(:ident). add(KEYWORDS, :keyword). add(OLD_KEYWORDS, :old_keyword). add(PREDEFINED_METHODS_AND_TYPES, :predefined). add(PREDEFINED_VARIABLES_AND_CONSTANTS, :pre_constant). add(PREDEFINED_EXCEPTIONS, :exception) NAME = / [^\W\d] \w* /x ESCAPE = / [abfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} | N\{[-\w ]+\} /x OPERATOR = / \.\.\. | # ellipsis \.(?!\d) | # dot but not decimal point [,;:()\[\]{}] | # simple delimiters \/\/=? | \*\*=? | # special math [-+*\/%&|^]=? | # ordinary math and binary logic [~`] | # binary complement and inspection <<=? | >>=? | [<>=]=? | != # comparison and assignment /x STRING_DELIMITER_REGEXP = Hash.new do |h, delimiter| h[delimiter] = Regexp.union delimiter end STRING_CONTENT_REGEXP = Hash.new do |h, delimiter| h[delimiter] = / [^\\\n]+? (?= \\ | $ | #{Regexp.escape(delimiter)} ) /x end DEF_NEW_STATE = WordList.new(:initial). add(%w(def), :def_expected). add(%w(import from), :include_expected). add(%w(class), :class_expected) DESCRIPTOR = / #{NAME} (?: \. #{NAME} )* | \* /x def scan_tokens tokens, options state = :initial string_delimiter = nil string_raw = false import_clause = class_name_follows = last_token_dot = false unicode = string.respond_to?(:encoding) && string.encoding.name == 'UTF-8' from_import_state = [] until eos? kind = nil match = nil if state == :string if scan(STRING_DELIMITER_REGEXP[string_delimiter]) tokens << [matched, :delimiter] tokens << [:close, :string] state = :initial next elsif string_delimiter.size == 3 && scan(/\n/) kind = :content elsif scan(STRING_CONTENT_REGEXP[string_delimiter]) kind = :content elsif !string_raw && scan(/ \\ #{ESCAPE} /ox) kind = :char elsif scan(/ \\ #{UNICODE_ESCAPE} /ox) kind = :char elsif scan(/ \\ . /x) kind = :content elsif scan(/ \\ | $ /x) tokens << [:close, :string] kind = :error state = :initial else raise_inspect "else case \" reached; %p not handled." % peek(1), tokens, state end elsif match = scan(/ [ \t]+ | \\\n /x) tokens << [match, :space] next elsif match = scan(/\n/) tokens << [match, :space] state = :initial if state == :include_expected next elsif match = scan(/ \# [^\n]* /mx) tokens << [match, :comment] next elsif state == :initial if scan(/#{OPERATOR}/o) kind = :operator elsif match = scan(/(u?r?|b)?("""|"|'''|')/i) tokens << [:open, :string] string_delimiter = self[2] string_raw = false modifiers = self[1] unless modifiers.empty? string_raw = !!modifiers.index(?r) tokens << [modifiers, :modifier] match = string_delimiter end state = :string kind = :delimiter # TODO: backticks elsif match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o) kind = IDENT_KIND[match] # TODO: keyword arguments kind = :ident if last_token_dot if kind == :old_keyword kind = check(/\(/) ? :ident : :keyword elsif kind == :predefined && check(/ *=/) kind = :ident elsif kind == :keyword state = DEF_NEW_STATE[match] from_import_state << match.to_sym if state == :include_expected end elsif scan(/@[a-zA-Z0-9_.]+[lL]?/) kind = :decorator elsif scan(/0[xX][0-9A-Fa-f]+[lL]?/) kind = :hex elsif scan(/0[bB][01]+[lL]?/) kind = :bin elsif match = scan(/(?:\d*\.\d+|\d+\.\d*)(?:[eE][+-]?\d+)?|\d+[eE][+-]?\d+/) kind = :float if scan(/[jJ]/) match << matched kind = :imaginary end elsif scan(/0[oO][0-7]+|0[0-7]+(?![89.eE])[lL]?/) kind = :oct elsif match = scan(/\d+([lL])?/) kind = :integer if self[1] == nil && scan(/[jJ]/) match << matched kind = :imaginary end else getch kind = :error end elsif state == :def_expected state = :initial if match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o) kind = :method else next end elsif state == :class_expected state = :initial if match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o) kind = :class else next end elsif state == :include_expected if match = scan(unicode ? /#{DESCRIPTOR}/uo : /#{DESCRIPTOR}/o) kind = :include if match == 'as' kind = :keyword from_import_state << :as elsif from_import_state.first == :from && match == 'import' kind = :keyword from_import_state << :import elsif from_import_state.last == :as # kind = match[0,1][unicode ? /[[:upper:]]/u : /[[:upper:]]/] ? :class : :method kind = :ident from_import_state.pop elsif IDENT_KIND[match] == :keyword unscan match = nil state = :initial next end elsif match = scan(/,/) from_import_state.pop if from_import_state.last == :as kind = :operator else from_import_state = [] state = :initial next end else raise_inspect 'Unknown state', tokens, state end match ||= matched if $DEBUG and not kind raise_inspect 'Error token %p in line %d' % [[match, kind], line], tokens, state end raise_inspect 'Empty token', tokens, state unless match last_token_dot = match == '.' tokens << [match, kind] end if state == :string tokens << [:close, :string] end tokens end end end end