module CodeRay

  # = Tokens
  #
  # The Tokens class represents a list of tokens returned from
  # a Scanner.
  #
  # A token is not a special object, just a two-element Array
  # consisting of
  # * the _token_ _text_ (the original source of the token in a String)
  # * the _token_ _kind_ (a Symbol representing the type of the token)
  #
  # A token looks like this:
  #
  #   ['# It looks like this', :comment]
  #   ['3.1415926', :float]
  #   ['äöü', :error]
  #
  # Some scanners also yield sub-tokens, represented by special
  # token texts, namely :open and :close.
  #
  # The Ruby scanner, for example, splits "a string" into:
  #
  #   [
  #    [:open, :string],
  #    ['"', :delimiter],
  #    ['a string', :content],
  #    ['"', :delimiter],
  #    [:close, :string]
  #   ]
  #
  # Tokens is also the interface between Scanners and Encoders:
  # The input is split and saved into a Tokens object. The Encoder
  # then builds the output from this object.
  #
  # Thus, the syntax below becomes clear:
  #
  #   CodeRay.scan('price = 2.59', :ruby).html
  #   # the Tokens object is here -------^
  #
  # See how small it is? ;)
  #
  # Tokens gives you the power to handle pre-scanned code very easily:
  # You can convert it to a webpage, a YAML file, or dump it into a
  # gzip'ed string that you put in your DB.
  #
  # Tokens' subclass TokenStream allows streaming to save memory.
  class Tokens < Array

    class << self

      # Convert a token to a string.
      #
      # This format is used by Encoders::Tokens.
      # It can be reverted using read_token.
      def write_token text, type
        if text.is_a? String
          "#{type}\t#{escape(text)}\n"
        else
          ":#{text}\t#{type}\t\n"
        end
      end

      # Read a token from the string.
      #
      # Inversion of write_token.
      #
      # TODO Test this!
      def read_token token
        type, text = token.split("\t", 2)
        if type[0] == ?:
          # a sub-token like [:open, :string]: the action comes first,
          # the kind is in the second field (followed by a trailing tab)
          [type[1..-1].to_sym, text.strip.to_sym]
        else
          [type.to_sym, unescape(text)]
        end
      end

      # Escapes a string for use in write_token.
      def escape text
        text.gsub(/[\n\\]/, '\\\\\&')
      end

      # Unescapes a string created by escape.
      def unescape text
        text.gsub(/\\[\n\\]/) { |m| m[1, 1] }
      end

    end

    # Whether the object is a TokenStream.
    #
    # Returns false.
    def stream?
      false
    end

    # Iterates over all tokens.
    #
    # If a filter is given, only tokens of that kind are yielded.
    def each kind_filter = nil, &block
      unless kind_filter
        super(&block)
      else
        super() do |text, kind|
          next unless kind == kind_filter
          yield text, kind
        end
      end
    end

    # Iterates over all text tokens.
    # Range tokens like [:open, :string] are left out.
    #
    # Example:
    #   tokens.each_text_token { |text, kind| text.replace html_escape(text) }
    def each_text_token
      each do |text, kind|
        next unless text.is_a? ::String
        yield text, kind
      end
    end

    # Encode the tokens using encoder.
    #
    # encoder can be
    # * a symbol like :html or :statistic
    # * an Encoder class
    # * an Encoder object
    #
    # options are passed to the encoder.
    def encode encoder, options = {}
      unless encoder.is_a? Encoders::Encoder
        encoder_class = encoder.is_a?(Class) ? encoder : Encoders[encoder]
        encoder = encoder_class.new options
      end
      encoder.encode_tokens self, options
    end

    # Turn into a string using Encoders::Text.
    #
    # +options+ are passed to the encoder if given.
    def to_s options = {}
      encode :text, options
    end

    # Redirects unknown methods to encoder calls.
    #
    # For example, if you call +tokens.html+, the HTML encoder
    # is used to highlight the tokens.
    def method_missing meth, options = {}
      Encoders[meth].new(options).encode_tokens self
    end
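
    # A usage sketch for the three call styles accepted by encode
    # (assuming the standard :html and :statistic encoders are loaded):
    #
    #   tokens = CodeRay.scan 'price = 2.59', :ruby
    #   tokens.encode :html                    # by symbol
    #   tokens.encode CodeRay::Encoders::HTML  # by class
    #   tokens.statistic                       # via method_missing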

    # Returns the tokens compressed by joining consecutive
    # tokens of the same kind.
    #
    # This cannot be undone, but should yield the same output
    # in most Encoders. It basically makes the output smaller.
    #
    # Combined with dump, it saves space for the cost of time.
    #
    # If the scanner is written carefully, this is not required -
    # for example, consecutive //-comment lines could already be
    # joined in one comment token by the Scanner.
    def optimize
      print ' Tokens#optimize: before: %d - ' % size if $DEBUG
      last_kind = last_text = nil
      new = self.class.new
      for text, kind in self
        if text.is_a? String
          if kind == last_kind
            last_text << text
          else
            new << [last_text, last_kind] if last_kind
            last_text = text.dup  # dup so the original token is not mutated
            last_kind = kind
          end
        else
          new << [last_text, last_kind] if last_kind
          last_kind = last_text = nil
          new << [text, kind]
        end
      end
      new << [last_text, last_kind] if last_kind
      print 'after: %d (%d saved = %2.0f%%)' %
        [new.size, size - new.size, 100.0 * (size - new.size) / size] if $DEBUG
      new
    end

    # Compact the object itself; see optimize.
    def optimize!
      replace optimize
    end

    # Ensure that all :open tokens have a corresponding :close one.
    #
    # TODO: Test this!
    def fix
      tokens = self.class.new
      # Check token nesting using a stack of kinds.
      opened = []
      for token, kind in self
        if token == :open
          opened.push kind
        elsif token == :close
          expected = opened.pop
          if kind != expected
            # Unexpected :close; decide what to do based on the kind:
            # - kind was never opened: restore the stack and
            #   delete the :close (skip with next)
            unless opened.rindex kind
              opened.push expected if expected
              next
            end
            # - kind was opened earlier: also close tokens in between
            begin
              tokens << [:close, expected]
              expected = opened.pop
            end until expected == kind
          end
        end
        tokens << [token, kind]
      end
      # Close remaining opened tokens
      tokens << [:close, kind] while kind = opened.pop
      tokens
    end

    def fix!
      replace fix
    end

    # Makes sure that:
    # - newlines are single tokens
    #   (which means all other tokens are single-line)
    # - there are no open tokens at the end of a line
    #
    # This makes it simple for encoders that work line-oriented,
    # like HTML with list-style line numbering.
    def split_into_lines
      raise NotImplementedError
    end

    def split_into_lines!
      replace split_into_lines
    end

    # Dumps the object into a String that can be saved
    # in files or databases.
    #
    # The dump is created with Marshal.dump;
    # in addition, it is gzipped using GZip.gzip.
    #
    # The returned String object includes Undumping,
    # so it has an #undump method. See Tokens.load.
    #
    # You can configure the level of compression,
    # but the default value 7 should be what you want
    # in most cases, as it is a good compromise between
    # speed and compression rate.
    #
    # See GZip module.
    def dump gzip_level = 7
      require 'coderay/helpers/gzip_simple'
      dump = Marshal.dump self
      dump = dump.gzip gzip_level
      dump.extend Undumping
    end

    # The total size of the token texts.
    # Should be equal to the input size before
    # scanning.
    def text_size
      size = 0
      each_text_token do |t, k|
        size += t.size
      end
      size
    end

    # The source text of the tokens, joined into a single String.
    # Should be equal to the input before scanning.
    def text
      map { |t, k| t if t.is_a? ::String }.join
    end

    # Include this module to give an object an #undump
    # method.
    #
    # The string returned by Tokens.dump includes Undumping.
    module Undumping
      # Calls Tokens.load with itself.
      def undump
        Tokens.load self
      end
    end

    # Undump the object using Marshal.load, then
    # unzip it using GZip.gunzip.
    #
    # The result is commonly a Tokens object, but
    # this is not guaranteed.
    def Tokens.load dump
      require 'coderay/helpers/gzip_simple'
      dump = dump.gunzip
      Marshal.load dump
    end

  end
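
  # A round-trip sketch for dump and load (assuming a Tokens object
  # +tokens+; the gzip helper is required by the methods themselves):
  #
  #   dump = tokens.optimize.dump  # gzipped String, extended with Undumping
  #   copy = dump.undump           # same as Tokens.load(dump)
  #   copy.text == tokens.text     #-> true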

  # = TokenStream
  #
  # The TokenStream class is a fake Array without elements.
  #
  # It redirects the method << to a block given at creation.
  #
  # This allows scanners and Encoders to use streaming (no
  # tokens are saved; the input is highlighted at the same time
  # it is scanned) with the same code.
  #
  # See CodeRay.encode_stream and CodeRay.scan_stream
  class TokenStream < Tokens

    # Whether the object is a TokenStream.
    #
    # Returns true.
    def stream?
      true
    end

    # The Array is empty, but size counts the tokens given by <<.
    attr_reader :size

    # Creates a new TokenStream that calls +block+ whenever
    # its << method is called.
    #
    # Example:
    #
    #   require 'coderay'
    #
    #   token_stream = CodeRay::TokenStream.new do |text, kind|
    #     puts 'kind: %s, text size: %d.' % [kind, text.size]
    #   end
    #
    #   token_stream << ['/\d+/', :regexp]
    #   #-> kind: regexp, text size: 5.
    #
    def initialize &block
      raise ArgumentError, 'Block expected for streaming.' unless block
      @callback = block
      @size = 0
    end

    # Calls +block+ with +token+ and increments size.
    #
    # Returns self.
    def << token
      @callback.call token
      @size += 1
      self
    end

    # This method is not implemented due to speed reasons. Use Tokens.
    def text_size
      raise NotImplementedError,
        'This method is not implemented due to speed reasons.'
    end

    # A TokenStream cannot be dumped. Use Tokens.
    def dump
      raise NotImplementedError, 'A TokenStream cannot be dumped.'
    end

    # A TokenStream cannot be optimized. Use Tokens.
    def optimize
      raise NotImplementedError, 'A TokenStream cannot be optimized.'
    end

  end

  # Token name abbreviations
  require 'coderay/token_classes'

end
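
# A minimal, runnable streaming sketch (an assumption: CodeRay's lib
# directory is on the load path). It is only executed when this file
# is run directly:
if $0 == __FILE__
  stream = CodeRay::TokenStream.new do |text, kind|
    # each token is passed straight to this block instead of being stored
    puts '%s: %p' % [kind, text]
  end
  stream << ['print', :ident] << [' ', :space] << ['"Hi"', :string]
  puts '%d tokens streamed.' % stream.size
end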