# 293 lines, 7.6 KiB, Ruby
module CodeRay
|
|
|
|
require 'coderay/helpers/plugin'
|
|
|
|
# = Scanners
#
# This module holds the Scanner class and its subclasses.
# For example, the Ruby scanner is named CodeRay::Scanners::Ruby
# and can be found in coderay/scanners/ruby.
#
# Scanners also provides methods and constants for the register
# mechanism and the [] method that returns the Scanner class
# belonging to the given lang.
#
# See PluginHost.
|
|
module Scanners
|
|
extend PluginHost
|
|
plugin_path File.dirname(__FILE__), 'scanners'
|
|
|
|
require 'strscan'
|
|
|
|
# = Scanner
#
# The base class for all Scanners.
#
# It is a subclass of Ruby's great +StringScanner+, which
# makes it easy to access the scanning methods inside.
#
# It is also +Enumerable+, so you can use it like an Array of
# Tokens:
#
#   require 'coderay'
#
#   c_scanner = CodeRay::Scanners[:c].new "if (*p == '{') nest++;"
#
#   for text, kind in c_scanner
#     puts text if kind == :operator
#   end
#
#   # prints: (*==)++;
#
# OK, this is a very simple example :)
# You can also use +map+, +any?+, +find+ and even +sort_by+,
# if you want.
|
|
class Scanner < StringScanner
|
|
|
|
# Give the class the Plugin class-level API and declare the Scanners module
# as its plugin host.
extend Plugin
plugin_host Scanners
|
|
|
|
# Raised if a Scanner fails while scanning.
#
# Derives from StandardError (not Exception) so that a plain +rescue+,
# which only handles StandardError and its descendants, can catch it.
ScanError = Class.new(StandardError)
|
|
|
|
require 'coderay/helpers/word_list'
|
|
|
|
# The default options for all scanner classes.
#
# Subclasses can define their own DEFAULT_OPTIONS constant: Scanner#initialize
# looks it up through self.class::DEFAULT_OPTIONS and merges the caller's
# options into it.
DEFAULT_OPTIONS = { :stream => false }

# NOTE(review): not referenced in this file; presumably the token kinds that
# do not count as lines of code - confirm against the callers.
KINDS_NOT_LOC = [:comment, :doctype]
|
|
|
|
class << self
|
|
|
|
# Returns whether this Scanner class can be used in streaming mode,
# i.e. whether the class includes the Streamable module.
#
# +self+ is the scanner class here, so Module#include? is the right test:
# Scanner#initialize decides streamability with +kind_of? Streamable+ on the
# instance, which is true exactly when the class includes the module.
# (+is_a? Streamable+ on the class object would only be true if the class
# had been extended with it.)
def streamable?
  include? Streamable
end
|
|
|
|
# Normalizes +code+ for scanning and returns the resulting String.
#
# The argument is converted with +to_s+. When Strings respond to
# +force_encoding+, the string is tagged as UTF-8, falling back to binary
# if its bytes are not valid UTF-8. Finally the line endings are converted
# with String#to_unix.
#
# A frozen String is duplicated first, because +force_encoding+ modifies
# its receiver and would raise on a frozen object. Unfrozen input is still
# re-tagged in place.
def normify code
  code = code.to_s
  if code.respond_to? :force_encoding
    code = code.dup if code.frozen?
    code.force_encoding 'utf-8'
    # Ask the string directly instead of provoking an ArgumentError with a
    # regexp match; no exception is raised, so $DEBUG need not be silenced.
    code.force_encoding 'binary' unless code.valid_encoding?
  end
  code.to_unix
end
|
|
|
|
# Reads or sets the file extension of this scanner class.
#
# With an +extension+ argument, stores its String form and returns it.
# Without one, returns the stored extension; the first such call falls back
# to the String form of +plugin_id+ and remembers it.
def file_extension extension = nil
  return @file_extension = extension.to_s if extension
  @file_extension ||= plugin_id.to_s
end
|
|
|
|
end
|
|
|
|
=begin
## Excluded for speed reasons; protected seems to make methods slow.

  # Save the StringScanner methods from being called.
  # This would not be useful for highlighting.
  strscan_public_methods =
    StringScanner.instance_methods -
    StringScanner.ancestors[1].instance_methods
  protected(*strscan_public_methods)
=end
|
|
|
|
# Create a new Scanner.
#
# * +code+ is the input String. It is normalized with Scanner.normify and
#   then handled by the superclass StringScanner.
# * +options+ is a Hash with Symbols as keys.
#   It is merged with the default options of the class (you can
#   overwrite default options here.) A token container passed as
#   options[:tokens] is used instead of creating a new one.
# * +block+ is the callback for streamed highlighting.
#
# If you set :stream to +true+ in the options, the Scanner uses a
# TokenStream with the +block+ as callback to handle the tokens.
# Else, a Tokens object is used.
#
# Raises a RuntimeError when the base class Scanner itself is instantiated,
# and NotStreamableError when :stream is set but the scanner is not
# Streamable. A mismatch between :stream and the presence of a block only
# produces a warning.
def initialize code='', options = {}, &block
  raise "I am only the basic Scanner class. I can't scan "\
    "anything. :( Use my subclasses." if self.class == Scanner

  @options = self.class::DEFAULT_OPTIONS.merge options

  super Scanner.normify(code)

  @tokens = options[:tokens]
  if @options[:stream]
    warn "warning in CodeRay::Scanner.new: :stream is set, "\
      "but no block was given" unless block_given?
    raise NotStreamableError, self unless kind_of? Streamable
    @tokens ||= TokenStream.new(&block)
  else
    warn "warning in CodeRay::Scanner.new: Block given, "\
      "but :stream is #{@options[:stream]}" if block_given?
    @tokens ||= Tokens.new
  end
  @tokens.scanner = self

  setup
end
|
|
|
|
# Resets the scanner through the superclass and then clears the per-scan
# state with reset_instance.
def reset
  super
  reset_instance
end
|
|
|
|
# Replaces the input string. +code+ is normalized with Scanner.normify
# before it is handed to the superclass; afterwards the per-scan state is
# cleared with reset_instance.
def string= code
  super Scanner.normify(code)
  reset_instance
end
|
|
|
|
# More mnemonic accessor names for the input string:
# +code+ reads it, +code=+ replaces it.
alias code string
alias code= string=
|
|
|
|
# Returns the Plugin ID for this scanner, as reported by its class.
def lang
  self.class.plugin_id
end
|
|
|
|
# Scans the code and returns the tokens; the result is also cached for
# #tokens.
#
# * +new_string+, if given, replaces the input string before scanning.
# * +options+ is merged over the instance options and passed on to
#   scan_tokens.
#
# In streaming mode the scanner is reset first (unless a new string was
# just assigned) and the token container itself is returned. Otherwise the
# return value of scan_tokens is returned.
def tokenize new_string=nil, options = {}
  options = @options.merge(options)
  self.string = new_string if new_string
  @cached_tokens =
    if @options[:stream]  # :stream must have been set already
      reset unless new_string
      scan_tokens @tokens, options
      @tokens
    else
      scan_tokens @tokens, options
    end
end
|
|
|
|
# Returns the cached result of the last #tokenize call, running #tokenize
# first if nothing is cached.
def tokens
  @cached_tokens ||= tokenize
end
|
|
|
|
# Whether the scanner is in streaming mode, i.e. whether the :stream option
# is set. Always returns +true+ or +false+.
def streaming?
  !!@options[:stream]
end
|
|
|
|
# Traverses the tokens, passing each one to +block+.
#
# Raises ArgumentError in streaming mode (:stream option set).
def each &block
  if @options[:stream]
    raise ArgumentError, 'Cannot traverse TokenStream.'
  end
  tokens.each(&block)
end
include Enumerable
|
|
|
|
# The current line position of the scanner, starting with 1.
#
# Only the text before the scan pointer is counted (exclusive range), so a
# pointer resting on a "\n" still belongs to the line that newline ends.
#
# Beware, this is implemented inefficiently. It should be used
# for debugging only.
#
# NOTE(review): StringScanner#pos is a byte offset while String#[] indexes
# characters, so the result may be off for multibyte input - confirm.
def line
  string[0...pos].count("\n") + 1
end
|
|
|
|
# The column of +pos+ (default: the current scan position), starting
# with 1, like #line.
#
# The newline search starts at <tt>pos - 1</tt> so that a newline located
# at +pos+ itself is not taken as the start of the line, and a missing
# newline counts as index -1. This makes the first line 1-based like all
# the following ones.
#
# When the string contains multibyte characters, a binary copy is searched
# (and kept in @bin_string) so that the index handed to +rindex+ is
# interpreted in bytes.
def column pos = self.pos
  return 1 if pos <= 0
  text = string
  if text.respond_to?(:bytesize) && (defined?(@bin_string) || text.bytesize != text.size)
    @bin_string ||= text.dup.force_encoding('binary')
    text = @bin_string
  end
  pos - (text.rindex("\n", pos - 1) || -1)
end
|
|
|
|
# Marshal support: only the options Hash is serialized.
def marshal_dump
  @options
end

# Marshal support: restores the options Hash written by #marshal_dump.
def marshal_load options
  @options = options
end
|
|
|
|
protected
|
|
|
|
# Hook called at the end of #initialize. Can be implemented by subclasses
# to do some initialization that has to be done once per instance.
# The default implementation does nothing.
#
# Use reset for initialization that has to be done once per
# scan.
def setup
end
|
|
|
|
# This is the central method, and commonly the only one a
# subclass implements.
#
# Subclasses must implement this method; it must return +tokens+
# and must only use Tokens#<< for storing scanned tokens!
#
# The base implementation always raises NotImplementedError, naming the
# class that lacks the method.
def scan_tokens tokens, options
  raise NotImplementedError, "#{self.class}#scan_tokens not implemented."
end
|
|
|
|
# Clears the state that belongs to a single scan: empties the token
# container (unless the :keep_tokens option is set), drops the cached
# tokens and invalidates the binary copy of the string used by #column.
def reset_instance
  @tokens.clear unless @options[:keep_tokens]
  @cached_tokens = nil
  # Only touch @bin_string if it already exists, so it stays undefined
  # for scanners that never needed it.
  @bin_string = nil if defined? @bin_string
end
|
|
|
|
# Raises a ScanError with additional status information.
#
# * +msg+ is the error description.
# * +tokens+ are the tokens scanned so far; their count and the last ten
#   of them are included in the message.
# * +state+ is the scanner state to report.
# * +ambit+ is the number of characters of code shown before and after
#   the current position.
def raise_inspect msg, tokens, state = 'No state given!', ambit = 30
  # Clamp the start of the "before" excerpt at 0: a negative start index
  # would make String#[] count from the end of the string.
  before_start = [pos - ambit, 0].max
  raise ScanError, <<-EOE % [


***ERROR in %s: %s (after %d tokens)

tokens:
%s

current line: %d column: %d pos: %d
matched: %p state: %p
bol? = %p, eos? = %p

surrounding code:
%p ~~ %p


***ERROR***

  EOE
    File.basename(caller[0]),
    msg,
    tokens.size,
    tokens.last(10).map { |t| t.inspect }.join("\n"),
    line, column, pos,
    matched, state, bol?, eos?,
    string[before_start, pos - before_start],
    string[pos, ambit],
  ]
end
|
|
|
|
end
|
|
|
|
end
|
|
end
|
|
|
|
class String
  # I love this hack. It seems to silence all dos/unix/mac newline problems.
  #
  # Returns the string with every "\r\n" and every lone "\r" replaced by
  # "\n". A string without any "\r" is returned as is (the receiver itself,
  # not a copy).
  def to_unix
    return self unless index ?\r
    gsub(/\r\n?/, "\n")
  end
end
|