#!/usr/bin/python

# Search Highlighter
# Copyright (C) 2003 Neil Fraser, Scotland
# http://neil.fraser.name/

# This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation.
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.  http://www.gnu.org/

# The following meta tag in a document will turn the highlighter on or off:
# <META NAME="highlighter_on" VALUE="1">
# <META NAME="highlighter_on" VALUE="0">
# If no such meta tag is found, the default behaviour is:
highlighter_on = 1

# The following meta tag in a document will turn the jump-to-first-match on or off:
# <META NAME="highlighter_jump" VALUE="1">
# <META NAME="highlighter_jump" VALUE="0">
# If no such meta tag is found, the default behaviour is:
highlighter_jump = 1

# The following meta tag in a document will set the highlighting start tag:
# <META NAME="highlighter_starttag" VALUE="<BLINK>">
# If no such meta tag is found, the default start tag is:
highlighter_starttag = '<B STYLE="color: black; background-color: rgb(255, 255, 102);">'

# The following meta tag in a document will set the highlighting end tag:
# <META NAME="highlighter_endtag" VALUE="<BLINK>">
# If no such meta tag is found, the default end tag is:
highlighter_endtag = '</B>'

from HTMLParser import HTMLParser
from HTMLParser import HTMLParseError
import string
import sys
import os

# Build an efficient translation table with which to clean up both the query string and the web page text.
global cleanuptrans
ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz' # Older pythons don't have string.ascii_lowercase
cleanuptrans = string.maketrans(ascii_lowercase.upper(), ascii_lowercase)
for x in range(len(cleanuptrans)):
  if (string.digits+ascii_lowercase).find(cleanuptrans[x]) == -1:
    cleanuptrans = cleanuptrans.replace(cleanuptrans[x], ' ')

global matches # List of all matches found
global terms   # List of all search terms

class MyHTMLParser(HTMLParser):

  body = 0     # Keep track of whether we are inside the body block or not.
  script = 0   # Keep track of whether we are inside a script block or not.
  textarea = 0 # Keep track of whether we are inside a textarea or not.

  def handle_starttag(self, tag, attrs):
    # print "Encountered the beginning of a <"+tag+"> tag", self.getpos()
    if tag == 'script':
      self.script = 1
    elif tag == 'body' and self.script == 0:
      self.body = 1
    elif tag == 'textarea' and self.script == 0:
      self.textarea = 1
    elif tag == 'meta':
      # Look for meta tag directives for highlighter_on and highlighter_jump.
      name = ''
      content = ''
      for attr in attrs:
        if attr[0] == 'name':
          name = attr[1]
        elif attr[0] == 'content':
          content = attr[1].strip()
      if name == 'highlighter_on':
        global highlighter_on
        if content == '0':
          highlighter_on = 0
        elif content == '1':
          highlighter_on = 1
      elif name == 'highlighter_jump':
        global highlighter_jump
        if content == '0':
          highlighter_jump = 0
        elif content == '1':
          highlighter_jump = 1
      elif name == 'highlighter_sarttag':
        global highlighter_starttag
        highlighter_starttag = content
      elif name == 'highlighter_endtag':
        global highlighter_endtag
        highlighter_endtag = content

  def handle_endtag(self, tag):
    # print "Encountered the end of a <"+tag+"> tag", self.getpos()
    if tag == 'script':
      self.script = 0
    elif tag == 'textarea':
      self.textarea = 0

  def handle_data(self, data):
    # Feed every word of the page into each of the term-matching objects.
    # Keep track of the exact coordinates of each word in the page.
    if self.script == 0 and self.textarea == 0 and self.body == 1:
      data = data.translate(cleanuptrans)
      if data.rstrip() != '':
        (y, x) = self.getpos()
        # print "Data: '%s' (%s, %s)" % (data, x, y)
        for word in data.split(' '):
          if word:
            [termObj.feed(word, x, y) for termObj in terms]
          x = x + len(word)+1


class singleTermObj:
  # This simple object stores and matches a single-word search term.
  term = ""

  def __init__(self, term):
    # set the term for this object
    self.term = term

  def feed(self, word, x, y):
    # Does the word match this term?
    if self.term == word:
      matches.append((x, y, word))


class multiTermObj:
  # This object stores and matches a quoted search term with multiple words
  # and potentially some wildcards (yes, Google does wildcards).
  terms = ()
  queue = []
  wild = 0 # are there any wildcards
  termlength = 0 # number of terms

  def __init__(self, term):
    # set the terms for this object
    self.terms = term.split()
    # define a couple of time-saving props
    if "[WILDCARD]" in self.terms:
      self.wild = 1
    self.termlength = len(self.terms)

  def feed(self, word, x, y):
    # Take another word from the input stream, and see if it matches.
    if word in self.terms or self.wild:
      self.queue.append((x, y, word))
      if len(self.queue) > self.termlength:
        self.queue.pop(0)
      if len(self.queue) == self.termlength:
        for i in range(self.termlength):
          if (self.terms[i] != "[WILDCARD]" and self.terms[i] != self.queue[i][2]):
            break # no match
        else:
          # Entire phrase matched!
          matches.extend(self.queue)
    else:
      # Forget it.  No match possible.
      self.queue = []


def parse_q(q):
  # Parse through the query string and break it into words (respecting double-quotes).
  # Don't remove "-negated" terms since a) they probably aren't on the page and b) if they are, you probably want to know about them.
  statequote = 0 # 0=unquoted 1=quoted
  word = ''
  words = []
  for x in range(len(q)):
    if statequote:
      if q[x] == '"':
        statequote = 0
      else:
        word = word + q[x]
    else:
      if q[x] == '"':
        statequote = 1
        if word:
          words.append(word)
          word = ''
      elif q[x] == ' ':
        if word:
          words.append(word)
          word = ''
      else:
        word = word + q[x]
  if word:
    words.append(word)

  # The query is broken up, now clean up the peices
  terms = {}
  q_cleanuptrans = cleanuptrans[:ord("*")]+"*"+cleanuptrans[ord("*")+1:]
  for word in words:
    # protect * terms since they are wildcards, but nuke everything else
    word = " "+word+" "
    word = word.translate(q_cleanuptrans)
    word = word.replace(" * ", " [WILDCARD] ")
    word = word.replace("*", " ")

    while word.find('  ') != -1:
      word = word.replace('  ', ' ')
    word = word.strip()
    # remove duplicates, empty terms, lone wildcards, and comon words (as defined by Google).
    if word not in ('', '[WILDCARD]', 'the','of','and','a','to','in','is','that','it','for','was','on','are','as','with','at','be','this','from','I','or','by','what','when','an','which','will','about','how','who','where'):
      terms[word] = 1
  words = terms.keys()
  words.sort()
  #print "<!-- DEBUG: words =", words, "-->"
  return words

########################################

# Get the query from the refering page
q = os.environ.get('HTTP_REFERER') or ''
print '<!-- DEBUG: referer = ', q, '-->'
i = q.find('?')
if i == -1:
  # No query (Apache shouldn't have sent us this)
  q = ''
else:
  q = '&'+q[i+1:]
  i = q.find('&q=')
  if i != -1: # Google, AltaVista, AllTheWeb, MSN, etc.
    q = q[i+3:]
  else:
    i = q.find('&query=') # Lycos, AOL, HotBot, Netscape, etc.
    if i != -1:
      q = q[i+7:]
    else: # No 'q' or 'query' term (Apache shouldn't have sent us this)
      q = ''
  i = q.find('&') # Drop subsequent terms
  if i != -1:
    q = q[:i]

  q = q.replace('+', '%20') # '+' is a shortcut for '%20'

  i = q.find('%') # Deal with all the '%nn' sequences
  while(i != -1):
    nn = ' '
    if q[i:i+3] == '%22': # Double quotes
      nn = '"'
    elif q[i:i+3] == '%2B': # Plus symbol
      nn = '+'
    q = q[:i]+nn+q[i+3:]
    i = q.find('%')
print '<!-- DEBUG: q =', q, '-->'

# Convert the query string into a bunch of term objects.
terms = []
for term in parse_q(q):
  if term.find(" ") == -1:
    terms.append(singleTermObj(term))
  else:
    terms.append(multiTermObj(term))

# Load in the HTML document.
data = sys.stdin.read()
data = data.replace("\r\n", "\n") # Python likes LF, not CR/LF
data = data.replace("\r", "\n") # Rogue CR?

# Run the HTML through the parser and matcher.
matches = []
if (terms):
  parser = MyHTMLParser()
  try:
    parser.feed(data)
    parser.close()
  except HTMLParseError, msg:
    # Here be illegal HTML.  Run away and pretend we were never here.
    print "<!-- ERROR: highlighter's html parser crashed:", sys.exc_info()[0], "\n", msg, "-->"
    matches = []
  #print "<!-- DEBUG: matches =", matches, "-->"

# Highlight all matches.
if (highlighter_on and matches):
  # The x/y coordinates we get from HTMLParser refer to the start of each span of text.
  # The location where the word is actually located may be on a later line if the original
  # text block wrapped.  So we need to check each coordinate and locate its real position.
  locations = {}
  data = data.splitlines()
  for (startX, startY, word) in matches:
    startY = startY - 1 # HTMLParser counts from 1, not 0
    while startX > len(data[startY]):
      startX = startX - len(data[startY]) - 1
      startY = startY + 1
    (endX, endY) = (startX+len(word), startY)
    while endX > len(data[endY]):
      endX = endX - len(data[endY]) - 1
      endY = endY + 1

    # We've found the start and end coordinates.  Now store the information in a way that
    # it can be retrieved in order (without duplicates).  For this we'll use a dictionary
    # of dictionaries for the start points, and another nested pair for the end points.
    if not locations.has_key(startY):
      locations[startY] = {}
    locations[startY][startX] = (endY, endX)
  #print "<!-- DEBUG: locations =", locations, "-->"

  # Look for highlight markers that are separated by nothing more than whitespace.
  # In these cases, merge the two neighbouring highlights into one.
  # This block of code is optional, it just makes the output look better.
  rows = locations.keys()
  rows.sort()
  lasthighlight = ()
  for startY in rows:
    cols = locations[startY].keys()
    cols.sort()
    for startX in cols:
      (endY, endX) = locations[startY][startX]
      if lasthighlight:
        if lasthighlight[2] == startY:
          gap = data[startY][lasthighlight[3]:startX]
        else:
          gap = data[lasthighlight[2]][lasthighlight[3]:]
          if lasthighlight[2] == startY - 2:
            gap = gap + data[startY-1] # add the line in between
          elif lasthighlight[2] < startY -2:
            gap = "forget it" # more than one line in between, don't bother
          gap = gap + data[startY][:startX]
        if gap.lstrip() == '':
          #print "<!-- DEBUG: removing highlight gap from", (lasthighlight[2], lasthighlight[3]), "to", (startY, startX), "-->"
          locations[lasthighlight[0]][lasthighlight[1]] = (endY, endX) # expand the previous highlight
          del locations[startY][startX] # delete the current highlight (might leave an empty hash behind)
          (startY, startX) = (lasthighlight[0], lasthighlight[1])
      lasthighlight = (startY, startX, endY, endX)

  # We've located the starting and ending points where all the highlighting tags
  # need to be added.  The trick is to add them from the end, rolling forward.
  # That way we don't mess up our coordinates.
  rows = locations.keys()
  rows.sort()
  rows.reverse()
  for startY in rows:
    cols = locations[startY].keys()
    cols.sort()
    cols.reverse()
    for startX in cols:
      if highlighter_jump and startY == rows[-1] and startX == cols[-1]:
        highlighter_starttag = highlighter_starttag.replace('>', ' ID="highlighter_jump">', 1)
      (endY, endX) = locations[startY][startX]
      data[endY] = data[endY][:endX]+highlighter_endtag+data[endY][endX:]
      data[startY] = data[startY][:startX]+highlighter_starttag+data[startY][startX:]

  # Pull it all together, and we're done.
  data = string.join(data, "\n")


# Print the HTML page, complete with highlighting, but otherwise unmolested.
print data

# Add the JavaScript jump code if requested.
# Technically this should be printed inside the BODY tag.  But it isn't.
if highlighter_jump and highlighter_on and matches:
  print """
<SCRIPT TYPE="text/javascript" LANGUAGE="JavaScript1.2"><!--
var obj=null;
if(document.getElementById)
	obj=document.getElementById('highlighter_jump')
else if(document.all)
	obj=document.all['highlighter_jump']
var y=-50;
while(obj){y=y+obj.offsetTop; obj=obj.offsetParent;}
if(y>1) scrollTo(0, y);
//--></SCRIPT>
"""
