#!/usr/bin/python

from __future__ import nested_scopes

import sys
sys.path.insert(0, '/home/kragen/lib/python')
from twisted.protocols import irc
from twisted.internet import reactor, protocol
import time, re, cgi, os, string, urllib, stat

### tail -f stuff

def file_identity(struct_stat):
    """Return the (device, inode) pair taken from a stat result.

    `struct_stat` is indexed with the stat.ST_DEV and stat.ST_INO
    constants, so any sequence laid out like an os.stat() result works.
    Two stat results with equal identities refer to the same file.
    """
    device = struct_stat[stat.ST_DEV]
    inode = struct_stat[stat.ST_INO]
    return device, inode

def followtail(filename, callback, freq=1, fileobj=None, fstat=None):
    """Poll `filename` in the manner of ``tail -f`` and feed new data to `callback`.

    On the first call (no `fileobj`) the file is opened and positioned at
    its end, so only data appended afterwards is reported.  Every poll
    calls `callback` with whatever could be read since the previous poll
    (this may be an empty string), then reschedules itself through
    reactor.callLater.

    If the path now names a different file than the one being read
    (different device/inode, e.g. after log rotation), the new file is
    opened and read from its beginning on the next poll, and the old
    file object is closed.

    filename -- path of the file to follow
    callback -- called with one string argument per poll
    freq     -- delay passed to reactor.callLater between polls
    fileobj, fstat -- internal state carried from one poll to the next;
                      callers normally leave these unset
    """
    if fileobj is None:
        fileobj = open(filename)
        fileobj.seek(0, 2)  # start at EOF: ignore what is already in the file
    callback(fileobj.read())
    if fstat is None:
        fstat = os.fstat(fileobj.fileno())
    try:
        path_stat = os.stat(filename)
    except OSError:
        # The path is missing right now (perhaps mid-rotation); behave as
        # if nothing changed and keep reading the file we already hold.
        path_stat = fstat
    if file_identity(path_stat) != file_identity(fstat):
        try:
            newfile = open(filename)
        except (IOError, OSError):
            # The path vanished between stat and open; keep the old file
            # and try again on the next poll instead of ending the polling.
            pass
        else:
            # Release the rotated-away file so descriptors do not pile up.
            fileobj.close()
            fileobj = newfile
            fstat = os.fstat(fileobj.fileno())
            # don't seek to the end of a new file: all of it is new data
    reactor.callLater(freq, lambda: followtail(filename, callback, freq,
                                               fileobj, fstat))

# The search phrase travels in a query-string parameter:
# search.yahoo uses p=, google and msn use q=
search_term_re = re.compile('[?&][pq]=([^&;"]*)')


def extract_search_terms(url):
    """Return the URL-decoded search phrase found in `url`, or None.

    The phrase is the value of the first p= or q= query parameter; None
    is returned when the URL carries no such parameter.
    """
    match = search_term_re.search(url)
    if match is None:
        return None
    raw_terms = match.group(1)
    return urllib.unquote_plus(raw_terms)

class loghunk:
    """Buffer of raw log text that is consumed one complete line at a time.

    Text is added with append(); ips_urls_and_referers() consumes every
    complete line and keeps any unfinished trailing line for later.
    """

    def __init__(self):
        # Text received so far that has not yet been consumed.
        self.text = ''

    def ips_urls_and_referers(self):
        """Consume the complete lines and return their interesting fields.

        Returns a list of (fields[0], fields[6], fields[10]) tuples, one
        per complete line having more than ten whitespace-separated
        fields; shorter lines are dropped.  The callers treat these as
        client IP, requested URL and referer -- presumably the Apache
        combined log format; confirm against the server's LogFormat.
        """
        lines = self.text.split('\n')
        # The last element is whatever follows the final newline: an
        # unfinished line (or '') that must wait for more data.
        self.text = lines.pop()
        splitlines = [line.split() for line in lines]
        return [(fields[0], fields[6], fields[10])
                for fields in splitlines if len(fields) > 10]

    def append(self, text):
        """Add freshly read `text` to the end of the buffer."""
        self.text += text

def dispatch_search_terms_periodically(afile, do_what):
    """Follow the log file `afile` and report search-engine referers.

    Newly appended log text is collected in a loghunk.  For every
    complete line whose referer field contains search terms (as judged
    by extract_search_terms), `do_what(search_terms, ip, url)` is called.
    Lines without search terms are ignored.  Polling is set up through
    followtail.
    """
    lh = loghunk()

    def when_more_data(more_data):
        # Called by followtail with each newly read chunk of the log.
        lh.append(more_data)
        for ip, url, referer in lh.ips_urls_and_referers():
            st = extract_search_terms(referer)
            if st:
                do_what(st, ip, url)

    followtail(afile, when_more_data)

### bot stuff

# Configuration parameters:
#   channel (just below): the IRC channel the bot joins and sends its
#       notices to.
#   "localhost", 6667 (in main(), near the bottom of the file): the host
#       and port of the IRC server to talk to.

channel = '#referers'

class referer_bot(irc.IRCClient):
    """IRC client that announces search-engine referers found in web logs.

    Once signed on, it joins `channel` and starts following one access
    log per site; each log line whose referer carries search terms
    produces a NOTICE to the channel.

    NOTE(review): signedOn starts a new set of log followers every time
    it runs, and followtail reschedules itself forever.  If the factory
    builds a fresh referer_bot for each reconnect (as botfactory appears
    to), the followers belonging to earlier connections keep running and
    keep their files open.  Fixing this needs a way to cancel a
    followtail loop -- confirm and address separately.
    """

    def __init__(self, **args):
        """Accepts one keyword argument, `files`: a dict mapping a site's
        host name to the path of that site's access log."""
        self.last_timestamp = time.time()
        self.nickname = 'refererbot'
        self.realname = 'kragen+refererbot'
        self.versionName = 'refererbot'
        self.versionNum = '0'
        self.files_to_watch = args['files']

    def signedOn(self):
        """Join the channel and start following every configured log."""
        self.join(channel)
        for site, logfile in self.files_to_watch.items():
            dispatch_search_terms_periodically(
                logfile, self.site_got_referer('http://' + site))

    def site_got_referer(self, site):
        """Return a callback(search_terms, ip, url) that announces a hit
        for `site` (a URL prefix such as 'http://host') on the channel."""
        def announce(search_terms, ip, url):
            return self.notice(channel, "[%s] led %s to %s%s"
                               % (search_terms, ip, site, url))
        return announce

    # The bot only talks; whatever these hooks receive is ignored.
    def noticed(self, user, channel, message):
        pass

    def privmsg(self, user, channel, message):
        pass

class botfactory(protocol.ClientFactory):
    """Client factory that produces referer_bot instances for `filedict`.

    `filedict` maps each site's host name to the path of its access log;
    it is handed to every referer_bot as the `files` argument.
    """

    def __init__(self, filedict):
        # protocol.ClientFactory.__init__(self) doesn't work in old pythons
        self.protocol = lambda: referer_bot(files=filedict)

    def clientConnectionLost(self, connector, reason):
        # An established connection dropped: reconnect right away.
        connector.connect()

    def clientConnectionFailed(self, connector, reason):
        # The connection attempt itself failed: stop the reactor.
        reactor.stop()

def main():
    """Build the site -> access-log mapping, connect the bot to the IRC
    server on localhost:6667 and run the reactor."""
    sites = [
        'www.canonical.org', 'lists.canonical.org', 'www.bethmolnar.org',
        'www.bethmolnarart.com', 'www.bentwookie.org',
        'blog.reinit.org', 'blog.simpleideas.org', 'lists.reinit.org',
        'reinit.org', 'wiki.simpleideas.org', 'www.simpleideas.org',
    ]
    log_path_template = '/var/log/apache/%s-access.log'

    # Each site has its own access log, named after the site's host name.
    filedict = {}
    for hostname in sites:
        filedict[hostname] = log_path_template % hostname

    factory = botfactory(filedict)
    reactor.connectTCP("localhost", 6667, factory)
    reactor.run()


if __name__ == "__main__":
    main()
