Redmine/vendor/gems/coderay-1.0.0/lib/coderay/scanner.rb

# encoding: utf-8
require 'strscan'

module CodeRay

  autoload :WordList, 'coderay/helpers/word_list'
  
  # = Scanners
  #
  # This module holds the Scanner class and its subclasses.
  # For example, the Ruby scanner is named CodeRay::Scanners::Ruby
  # can be found in coderay/scanners/ruby.
  #
  # Scanner also provides methods and constants for the register
  # mechanism and the [] method that returns the Scanner class
  # belonging to the given lang.
  #
  # See PluginHost.
  module Scanners
    extend PluginHost
    plugin_path File.dirname(__FILE__), 'scanners'
    
    
    # = Scanner
    #
    # The base class for all Scanners.
    #
    # It is a subclass of Ruby's great +StringScanner+, which
    # makes it easy to access the scanning methods inside.
    #
    # It is also +Enumerable+, so you can use it like an Array of
    # Tokens:
    #
    #   require 'coderay'
    #   
    #   c_scanner = CodeRay::Scanners[:c].new "if (*p == '{') nest++;"
    #   
    #   for text, kind in c_scanner
    #     puts text if kind == :operator
    #   end
    #   
    #   # prints: (*==)++;
    #
    # OK, this is a very simple example :)
    # You can also use +map+, +any?+, +find+ and even +sort_by+,
    # if you want.
    class Scanner < StringScanner
      
      extend Plugin
      plugin_host Scanners
      
      # Raised if a Scanner fails while scanning
      ScanError = Class.new StandardError
      
      # The default options for all scanner classes.
      #
      # Define @default_options for subclasses.
      DEFAULT_OPTIONS = { }
      
      KINDS_NOT_LOC = [:comment, :doctype, :docstring]
      
      attr_accessor :state
      
      class << self
        
        # Normalizes the given code into a string with UNIX newlines, in the
        # scanner's internal encoding, with invalid and undefined charachters
        # replaced by placeholders. Always returns a new object.
        def normalize code
          # original = code
          code = code.to_s unless code.is_a? ::String
          return code if code.empty?
          
          if code.respond_to? :encoding
            code = encode_with_encoding code, self.encoding
          else
            code = to_unix code
          end
          # code = code.dup if code.eql? original
          code
        end
        
        # The typical filename suffix for this scanner's language.
        def file_extension extension = lang
          @file_extension ||= extension.to_s
        end
        
        # The encoding used internally by this scanner.
        def encoding name = 'UTF-8'
          @encoding ||= defined?(Encoding.find) && Encoding.find(name)
        end
        
        # The lang of this Scanner class, which is equal to its Plugin ID.
        def lang
          @plugin_id
        end
        
      protected
        
        def encode_with_encoding code, target_encoding
          if code.encoding == target_encoding
            if code.valid_encoding?
              return to_unix(code)
            else
              source_encoding = guess_encoding code
            end
          else
            source_encoding = code.encoding
          end
          # print "encode_with_encoding from #{source_encoding} to #{target_encoding}"
          code.encode target_encoding, source_encoding, :universal_newline => true, :undef => :replace, :invalid => :replace
        end
        
        def to_unix code
          code.index(?\r) ? code.gsub(/\r\n?/, "\n") : code
        end
        
        def guess_encoding s
          #:nocov:
          IO.popen("file -b --mime -", "w+") do |file|
            file.write s[0, 1024]
            file.close_write
            begin
              Encoding.find file.gets[/charset=([-\w]+)/, 1]
            rescue ArgumentError
              Encoding::BINARY
            end
          end
          #:nocov:
        end
        
      end
      
      # Create a new Scanner.
      #
      # * +code+ is the input String and is handled by the superclass
      #   StringScanner.
      # * +options+ is a Hash with Symbols as keys.
      #   It is merged with the default options of the class (you can
      #   overwrite default options here.)
      #
      # Else, a Tokens object is used.
      def initialize code = '', options = {}
        if self.class == Scanner
          raise NotImplementedError, "I am only the basic Scanner class. I can't scan anything. :( Use my subclasses."
        end
        
        @options = self.class::DEFAULT_OPTIONS.merge options
        
        super self.class.normalize(code)
        
        @tokens = options[:tokens] || Tokens.new
        @tokens.scanner = self if @tokens.respond_to? :scanner=
        
        setup
      end
      
      # Sets back the scanner. Subclasses should redefine the reset_instance
      # method instead of this one.
      def reset
        super
        reset_instance
      end
      
      # Set a new string to be scanned.
      def string= code
        code = self.class.normalize(code)
        super code
        reset_instance
      end
      
      # the Plugin ID for this scanner
      def lang
        self.class.lang
      end
      
      # the default file extension for this scanner
      def file_extension
        self.class.file_extension
      end
      
      # Scan the code and returns all tokens in a Tokens object.
      def tokenize source = nil, options = {}
        options = @options.merge(options)
        @tokens = options[:tokens] || @tokens || Tokens.new
        @tokens.scanner = self if @tokens.respond_to? :scanner=
        case source
        when Array
          self.string = self.class.normalize(source.join)
        when nil
          reset
        else
          self.string = self.class.normalize(source)
        end
        
        begin
          scan_tokens @tokens, options
        rescue => e
          message = "Error in %s#scan_tokens, initial state was: %p" % [self.class, defined?(state) && state]
          raise_inspect e.message, @tokens, message, 30, e.backtrace
        end
        
        @cached_tokens = @tokens
        if source.is_a? Array
          @tokens.split_into_parts(*source.map { |part| part.size })
        else
          @tokens
        end
      end
      
      # Cache the result of tokenize.
      def tokens
        @cached_tokens ||= tokenize
      end
      
      # Traverse the tokens.
      def each &block
        tokens.each(&block)
      end
      include Enumerable
      
      # The current line position of the scanner, starting with 1.
      # See also: #column.
      #
      # Beware, this is implemented inefficiently. It should be used
      # for debugging only.
      def line pos = self.pos
        return 1 if pos <= 0
        binary_string[0...pos].count("\n") + 1
      end
      
      # The current column position of the scanner, starting with 1.
      # See also: #line.
      def column pos = self.pos
        return 1 if pos <= 0
        pos - (binary_string.rindex(?\n, pos - 1) || -1)
      end
      
      # The string in binary encoding.
      # 
      # To be used with #pos, which is the index of the byte the scanner
      # will scan next.
      def binary_string
        @binary_string ||=
          if string.respond_to?(:bytesize) && string.bytesize != string.size
            #:nocov:
            string.dup.force_encoding('binary')
            #:nocov:
          else
            string
          end
      end
      
    protected
      
      # Can be implemented by subclasses to do some initialization
      # that has to be done once per instance.
      #
      # Use reset for initialization that has to be done once per
      # scan.
      def setup  # :doc:
      end
      
      # This is the central method, and commonly the only one a
      # subclass implements.
      #
      # Subclasses must implement this method; it must return +tokens+
      # and must only use Tokens#<< for storing scanned tokens!
      def scan_tokens tokens, options  # :doc:
        raise NotImplementedError, "#{self.class}#scan_tokens not implemented."
      end
      
      # Resets the scanner.
      def reset_instance
        @tokens.clear if @tokens.respond_to?(:clear) && !@options[:keep_tokens]
        @cached_tokens = nil
        @binary_string = nil if defined? @binary_string
      end
      
      # Scanner error with additional status information
      def raise_inspect msg, tokens, state = self.state || 'No state given!', ambit = 30, backtrace = caller
        raise ScanError, <<-EOE % [


***ERROR in %s: %s (after %d tokens)

tokens:
%s

current line: %d  column: %d  pos: %d
matched: %p  state: %p
bol? = %p,  eos? = %p

surrounding code:
%p  ~~  %p


***ERROR***

        EOE
          File.basename(caller[0]),
          msg,
          tokens.respond_to?(:size) ? tokens.size : 0,
          tokens.respond_to?(:last) ? tokens.last(10).map { |t| t.inspect }.join("\n") : '',
          line, column, pos,
          matched, state, bol?, eos?,
          binary_string[pos - ambit, ambit],
          binary_string[pos, ambit],
        ], backtrace
      end
      
      # Shorthand for scan_until(/\z/).
      # This method also avoids a JRuby 1.9 mode bug.
      def scan_rest
        rest = self.rest
        terminate
        rest
      end
      
    end
    
  end
end
Update CodeRay version to 1.0 final (#4264). git-svn-id: svn+ssh://rubyforge.org/var/svn/redmine/trunk@7618 e93f8b46-1217-0410-a6f0-8f06a7374b81 2011-10-08 17:34:30 +04:00			`# encoding: utf-8`
			`require 'strscan'`

			`module CodeRay`

			`autoload :WordList, 'coderay/helpers/word_list'`

			`# = Scanners`
			`#`
			`# This module holds the Scanner class and its subclasses.`
			`# For example, the Ruby scanner is named CodeRay::Scanners::Ruby`
			`# can be found in coderay/scanners/ruby.`
			`#`
			`# Scanner also provides methods and constants for the register`
			`# mechanism and the [] method that returns the Scanner class`
			`# belonging to the given lang.`
			`#`
			`# See PluginHost.`
			`module Scanners`
			`extend PluginHost`
			`plugin_path File.dirname(__FILE__), 'scanners'`


			`# = Scanner`
			`#`
			`# The base class for all Scanners.`
			`#`
			`# It is a subclass of Ruby's great +StringScanner+, which`
			`# makes it easy to access the scanning methods inside.`
			`#`
			`# It is also +Enumerable+, so you can use it like an Array of`
			`# Tokens:`
			`#`
			`# require 'coderay'`
			`#`
			`# c_scanner = CodeRay::Scanners[:c].new "if (*p == '{') nest++;"`
			`#`
			`# for text, kind in c_scanner`
			`# puts text if kind == :operator`
			`# end`
			`#`
			`# # prints: (*==)++;`
			`#`
			`# OK, this is a very simple example :)`
			`# You can also use +map+, +any?+, +find+ and even +sort_by+,`
			`# if you want.`
			`class Scanner < StringScanner`

			`extend Plugin`
			`plugin_host Scanners`

			`# Raised if a Scanner fails while scanning`
			`ScanError = Class.new StandardError`

			`# The default options for all scanner classes.`
			`#`
			`# Define @default_options for subclasses.`
			`DEFAULT_OPTIONS = { }`

			`KINDS_NOT_LOC = [:comment, :doctype, :docstring]`

			`attr_accessor :state`

			`class << self`

			`# Normalizes the given code into a string with UNIX newlines, in the`
			`# scanner's internal encoding, with invalid and undefined charachters`
			`# replaced by placeholders. Always returns a new object.`
			`def normalize code`
			`# original = code`
			`code = code.to_s unless code.is_a? ::String`
			`return code if code.empty?`

			`if code.respond_to? :encoding`
			`code = encode_with_encoding code, self.encoding`
			`else`
			`code = to_unix code`
			`end`
			`# code = code.dup if code.eql? original`
			`code`
			`end`

			`# The typical filename suffix for this scanner's language.`
			`def file_extension extension = lang`
			`@file_extension \|\|= extension.to_s`
			`end`

			`# The encoding used internally by this scanner.`
			`def encoding name = 'UTF-8'`
			`@encoding \|\|= defined?(Encoding.find) && Encoding.find(name)`
			`end`

			`# The lang of this Scanner class, which is equal to its Plugin ID.`
			`def lang`
			`@plugin_id`
			`end`

			`protected`

			`def encode_with_encoding code, target_encoding`
			`if code.encoding == target_encoding`
			`if code.valid_encoding?`
			`return to_unix(code)`
			`else`
			`source_encoding = guess_encoding code`
			`end`
			`else`
			`source_encoding = code.encoding`
			`end`
			`# print "encode_with_encoding from #{source_encoding} to #{target_encoding}"`
			`code.encode target_encoding, source_encoding, :universal_newline => true, :undef => :replace, :invalid => :replace`
			`end`

			`def to_unix code`
			`code.index(?\r) ? code.gsub(/\r\n?/, "\n") : code`
			`end`

			`def guess_encoding s`
			`#:nocov:`
			`IO.popen("file -b --mime -", "w+") do \|file\|`
			`file.write s[0, 1024]`
			`file.close_write`
			`begin`
			`Encoding.find file.gets[/charset=([-\w]+)/, 1]`
			`rescue ArgumentError`
			`Encoding::BINARY`
			`end`
			`end`
			`#:nocov:`
			`end`

			`end`

			`# Create a new Scanner.`
			`#`
			`# * +code+ is the input String and is handled by the superclass`
			`# StringScanner.`
			`# * +options+ is a Hash with Symbols as keys.`
			`# It is merged with the default options of the class (you can`
			`# overwrite default options here.)`
			`#`
			`# Else, a Tokens object is used.`
			`def initialize code = '', options = {}`
			`if self.class == Scanner`
			`raise NotImplementedError, "I am only the basic Scanner class. I can't scan anything. :( Use my subclasses."`
			`end`

			`@options = self.class::DEFAULT_OPTIONS.merge options`

			`super self.class.normalize(code)`

			`@tokens = options[:tokens] \|\| Tokens.new`
			`@tokens.scanner = self if @tokens.respond_to? :scanner=`

			`setup`
			`end`

			`# Sets back the scanner. Subclasses should redefine the reset_instance`
			`# method instead of this one.`
			`def reset`
			`super`
			`reset_instance`
			`end`

			`# Set a new string to be scanned.`
			`def string= code`
			`code = self.class.normalize(code)`
			`super code`
			`reset_instance`
			`end`

			`# the Plugin ID for this scanner`
			`def lang`
			`self.class.lang`
			`end`

			`# the default file extension for this scanner`
			`def file_extension`
			`self.class.file_extension`
			`end`

			`# Scan the code and returns all tokens in a Tokens object.`
			`def tokenize source = nil, options = {}`
			`options = @options.merge(options)`
			`@tokens = options[:tokens] \|\| @tokens \|\| Tokens.new`
			`@tokens.scanner = self if @tokens.respond_to? :scanner=`
			`case source`
			`when Array`
			`self.string = self.class.normalize(source.join)`
			`when nil`
			`reset`
			`else`
			`self.string = self.class.normalize(source)`
			`end`

			`begin`
			`scan_tokens @tokens, options`
			`rescue => e`
			`message = "Error in %s#scan_tokens, initial state was: %p" % [self.class, defined?(state) && state]`
			`raise_inspect e.message, @tokens, message, 30, e.backtrace`
			`end`

			`@cached_tokens = @tokens`
			`if source.is_a? Array`
			`@tokens.split_into_parts(*source.map { \|part\| part.size })`
			`else`
			`@tokens`
			`end`
			`end`

			`# Cache the result of tokenize.`
			`def tokens`
			`@cached_tokens \|\|= tokenize`
			`end`

			`# Traverse the tokens.`
			`def each &block`
			`tokens.each(&block)`
			`end`
			`include Enumerable`

			`# The current line position of the scanner, starting with 1.`
			`# See also: #column.`
			`#`
			`# Beware, this is implemented inefficiently. It should be used`
			`# for debugging only.`
			`def line pos = self.pos`
			`return 1 if pos <= 0`
			`binary_string[0...pos].count("\n") + 1`
			`end`

			`# The current column position of the scanner, starting with 1.`
			`# See also: #line.`
			`def column pos = self.pos`
			`return 1 if pos <= 0`
			`pos - (binary_string.rindex(?\n, pos - 1) \|\| -1)`
			`end`

			`# The string in binary encoding.`
			`#`
			`# To be used with #pos, which is the index of the byte the scanner`
			`# will scan next.`
			`def binary_string`
			`@binary_string \|\|=`
			`if string.respond_to?(:bytesize) && string.bytesize != string.size`
			`#:nocov:`
			`string.dup.force_encoding('binary')`
			`#:nocov:`
			`else`
			`string`
			`end`
			`end`

			`protected`

			`# Can be implemented by subclasses to do some initialization`
			`# that has to be done once per instance.`
			`#`
			`# Use reset for initialization that has to be done once per`
			`# scan.`
			`def setup # :doc:`
			`end`

			`# This is the central method, and commonly the only one a`
			`# subclass implements.`
			`#`
			`# Subclasses must implement this method; it must return +tokens+`
			`# and must only use Tokens#<< for storing scanned tokens!`
			`def scan_tokens tokens, options # :doc:`
			`raise NotImplementedError, "#{self.class}#scan_tokens not implemented."`
			`end`

			`# Resets the scanner.`
			`def reset_instance`
			`@tokens.clear if @tokens.respond_to?(:clear) && !@options[:keep_tokens]`
			`@cached_tokens = nil`
			`@binary_string = nil if defined? @binary_string`
			`end`

			`# Scanner error with additional status information`
			`def raise_inspect msg, tokens, state = self.state \|\| 'No state given!', ambit = 30, backtrace = caller`
			`raise ScanError, <<-EOE % [`


			`***ERROR in %s: %s (after %d tokens)`

			`tokens:`
			`%s`

			`current line: %d column: %d pos: %d`
			`matched: %p state: %p`
			`bol? = %p, eos? = %p`

			`surrounding code:`
			`%p ~~ %p`


			`*ERROR*`

			`EOE`
			`File.basename(caller[0]),`
			`msg,`
			`tokens.respond_to?(:size) ? tokens.size : 0,`
			`tokens.respond_to?(:last) ? tokens.last(10).map { \|t\| t.inspect }.join("\n") : '',`
			`line, column, pos,`
			`matched, state, bol?, eos?,`
			`binary_string[pos - ambit, ambit],`
			`binary_string[pos, ambit],`
			`], backtrace`
			`end`

			`# Shorthand for scan_until(/\z/).`
			`# This method also avoids a JRuby 1.9 mode bug.`
			`def scan_rest`
			`rest = self.rest`
			`terminate`
			`rest`
			`end`

			`end`

			`end`
			`end`