#!/usr/bin/python
"""The Crawler module for eGovMonHarvestMan

Including new sampling algorithm
"""
#    Copyright 2009-2010 eGovMon
#    This program is distributed under the terms of the GNU General
#    Public License.
#
#  This file is part of the eGovernment Monitoring (eGovMon)
#
#  eGovMon is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  eGovMon is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with eGovMon; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
#  MA 02110-1301 USA

# NOTE: the import order below is deliberate (see the socket warning) and
# is kept exactly as it was, duplicates included.
import sys
reload(sys)
# http://trac.edgewall.org/ticket/5628 - Should fix all unicodedecodeerrors we have
sys.setdefaultencoding('utf-8')
import re
import SOAPpy
import urlparse
import urllib
from logit import logit
import sc
import os

# Warning: This imports memcache which imports timeoutsocket. However
# timeoutsocket overrides the socket.socket factory function with its own
# function which screws up HTTPS urls for HarvestMan. So before creating
# HarvestMan objects we need to reset the socket.socket factory function
# to original !
import socket
gsocketf = socket.socket

import egovmondb
from egovmontime import *
#import changefreq
import psycopg2
import random
import time
import SOAPpy
import urlparse
import os
import sc
from socket import gethostname
from harvestman import *
from harvestmanklass import eGovMonHarvestMan

__author__ = "Anand B Pillai"
__version__ = "0.1"
__maintainer__ = "Terje Gjoesaeter, Morten Goodwin Olsen"

import string


def onlyascii(u):
    """Return u if it occurs in string.ascii_letters, otherwise '_'.

    Intended to be mapped over the single characters of a string.
    """
    if u in string.ascii_letters:
        return u
    return '_'


def sortstringlen(x, y):
    """Randomised cmp-style comparator on string length.

    With probability 0.75 the shorter string sorts first, otherwise the
    longer one does.  The result is -1, 0 or 1.
    """
    if random.uniform(0, 1) < 0.75:
        first, second = len(x), len(y)
    else:
        first, second = len(y), len(x)
    # Same value as cmp(first, second), spelled without the builtin.
    return (first > second) - (first < second)


class Crawler:
    """Crawl a single site within one testrun and notify the sampler."""

    def __init__(self, site, testrunid):
        """Set up configuration, SOAP proxy and database, then validate.

        Terminates the process with exit status 1 when the testrun does
        not exist, the site is not part of it, the site is already
        finished, or enough pages have already been downloaded.

        Keyword arguments:
        site -- the site to crawl
        testrunid -- identifier of the testrun the crawl belongs to
        """
        self.sc = sc.SystemConfiguration()
        #timeoutsocket.setDefaultSocketTimeout(None)
        self.siteurlserver = SOAPpy.SOAPProxy(
            urllib.unquote(self.sc.siteurlserver))
        self.testrunid = testrunid
        self.site = site
        logit('Crawling site: ' + self.site + ' with testrunid '
              + str(self.testrunid),
              somemodule='Crawler', stdout=True)
        self.egovmondb = egovmondb.eGovMonDB()

        run = str(self.testrunid)
        # Checking if testrun exists
        if not self.egovmondb.getTestRunID(self.testrunid):
            self._refuse('testrunid ' + run + ' is not initiated.')
        if not self.egovmondb.getSiteInTestrun(self.site, testrunid):
            self._refuse('it is not added to the the testrun ' + run + '.')
        if self.egovmondb.getSiteFinished(self.site, self.testrunid):
            self._refuse('results already exists on testrunid ' + run + '.')

        downloaded = self.egovmondb.getPagesDownloaded(self.site,
                                                       self.testrunid)
        if downloaded >= int(self.sc.pagestodownload):
            # Enough pages are already stored: hand over to the sampler
            # instead of crawling again.
            self._refuse('already at least ' + str(self.sc.pagestodownload)
                         + ' has been detected on testrunid ' + run + '.',
                         notify_sampler=True)
        if downloaded > int(self.sc.minsamplecount):
            self.sendToSampler()

        self.hostname = gethostname()
        self.crawlerrdfmodel = 'nomater'
        self.crawlerconfigtemplate = self.sc.crawlerconfigtemplate
        self.webcachedirectory = self.sc.webcachedirectory
        self.configdirectory = self.sc.configdirectory
        self.egovmondb.currentTestRun = self.testrunid
        self.pid = os.getpid()
        # initialisation finished

    def _refuse(self, reason, notify_sampler=False):
        """Log why the site cannot be crawled and exit with status 1."""
        logit('Cannot crawl site ' + self.site + ' because ' + reason,
              somemodule='Crawler', stdout=True)
        if notify_sampler:
            self.sendToSampler()
        sys.exit(1)

    def sendToSampler(self):
        """Ask the sampling server to load a sample for this site/testrun.

        Does nothing when 'onlycrawl' is present on the command line.
        """
        if 'onlycrawl' in sys.argv:
            return
        samplingserver = SOAPpy.SOAPProxy(self.sc.samplingserver)
        samplingserver.loadSample(str(self.site), str(self.testrunid))

    def startCrawling(self):
        """Run the crawl for the configured site (see siteSelection)."""
        self.siteSelection(self.egovmondb)

    def siteSelection(self, egovmondb):
        """Write the config for the site, crawl it and notify the sampler.

        The database connection is closed when this method returns or
        raises.

        Keyword arguments:
        egovmondb -- eGovMonDB (unused; self.egovmondb is what is used)
        """
        try:
            if not self.site:
                return
            logit('Crawling site ' + self.site.strip(),
                  somemodule='Crawler')
            try:
                conf, _basedir = self.mkConfig(self.site, self.testrunid)
            except IOError:
                # Error reading the template or writing the configuration
                # file.
                logit(str(sys.exc_info()[1]), somemodule='Crawler')
            else:
                print("Do the crawl for " + self.site)
                started = time.time()
                self.doCrawl(conf)
                # Note the following is a hack to be able to perform
                # performance evaluations.
                logit('Crawl duration:' + str(time.time() - started),
                      'performance', 'crawler')
                print('End of Crawl')

            # NOTE(review): the original nesting of the reporting below
            # could not be recovered; it is assumed to run whether or not
            # the config could be written - confirm.
            # TODO: Verify that the following is correct
            pagecount = self.egovmondb.getPagesDownloaded(self.site,
                                                          self.testrunid)
            logit('Finished crawling site:' + str(self.site)
                  + ' Number of pages detected by the crawler '
                  + str(pagecount),
                  somemodule='Crawler', stdout=True)
            if (pagecount >= int(self.sc.minpagessampled)
                    or self.egovmondb.isSmallSite(self.site)):
                self.sendToSampler()
            else:
                logit(str(self.site) + ' not sent to sampler because only '
                      + str(pagecount) + ' pages found ',
                      somemodule='Crawler', stdout=True)
        finally:
            self.egovmondb.close()

    def mkConfig(self, siteurl, testrun):
        """Create a configuration file for the harvestman project

        One config file per site, overwritten on each crawl.
        Naming: <configdirectory>config-<projectname>.xml
        The web cache base directory is built as
        <webcachedirectory><crawlerrdfmodel>/<testrunid>/<random 4 digits>/

        Keyword arguments:
        siteurl - url of the site to be crawled
        testrun - name of the testrun (unused; self.testrunid is used)

        Returns the tuple (configfilename, basedir).
        Raises IOError when the template cannot be read or the config
        file cannot be written.
        """
        # Read the template
        templatefile = open(self.crawlerconfigtemplate)
        try:
            templatecontent = templatefile.readlines()
        finally:
            templatefile.close()

        # Project name: the URL without scheme, slashes turned into '_'
        projectname = re.sub("http://|https://", "", siteurl)
        projectname = re.sub("/", "_", projectname).strip('_')

        # Construct new basedir
        basedir = (self.webcachedirectory + self.crawlerrdfmodel + "/"
                   + str(self.testrunid) + "/"
                   + str(random.randint(1000, 9999)) + "/")

        # The following is to support both pure domains and URLs
        thissiteurl = siteurl
        if siteurl.startswith('http://'):
            thissiteurl = siteurl[7:]
        elif siteurl.startswith('https://'):
            thissiteurl = siteurl[8:]
        thissiteurl = thissiteurl.rstrip('/')

        # Start from a randomly picked known start URL, when there is one
        starturls = self.egovmondb.getAllStartURLsFromSite(thissiteurl)
        if starturls:
            random.shuffle(starturls)
            #starturls.sort(sortstringlen)
            siteurl = urllib.quote(starturls[-1], safe=':/')

        proxy = urlparse.urlparse(urllib.unquote(self.sc.webproxy))
        # Substitute values in config content, in this order
        substitutions = [
            ("##lastmilehack##", self.sc.lastmilehack),
            ("##url##", siteurl),
            ("##name##", projectname),
            ("##sitereference##", thissiteurl),
            ("##baseDir##", basedir),
            ("##proxyServer##", proxy[1]),
            ("##proxyUser##", self.sc.webproxyuser),
            ("##proxyPassword##", self.sc.webproxypassword),
            # NOTE(review): index 2 is the URL *path*, not the port, and
            # index 1 (netloc) above includes any ':port' - confirm that
            # this is what the template expects.
            ("##proxyPort##", proxy[2]),
        ]
        for placeholder, value in substitutions:
            templatecontent = self.replaceString(templatecontent,
                                                 placeholder, value)

        # Construct config file name
        configfilename = (self.configdirectory + "config-" + projectname
                          + ".xml")
        configfile = open(configfilename, 'w')
        try:
            configfile.writelines(templatecontent)
        finally:
            # The file must really be closed (flushed) before HarvestMan
            # reads it.
            configfile.close()
        return (configfilename, basedir)

    def doCrawl(self, configfile):
        """Start a HarvestMan crawl with the specified config file

        Sets self.exceedmemory and self.exhaustivescan from the crawler's
        configuration once the crawl has returned.

        Keyword arguments:
        configfile - name of the config file
        """
        prepare(configfile)
        # Reset socket factory function to original else you get the
        # dreaded error message
        # "ssl() argument 1 must be _socket.socket, not _socketobject"!
        socket.socket = gsocketf
        crawler = eGovMonHarvestMan()
        print('Config: %s' % (configfile,))
        # Give various needed instances to crawler here
        crawler.setInstances(self.egovmondb, self.siteurlserver)
        crawler.main()

        try:
            self.exceedmemory = crawler._cfg.exceedmemory
        except (NameError, AttributeError):
            self.exceedmemory = False
        # NOTE(review): the attribute is the negation of the config flag;
        # kept as found - confirm this is intended.
        if crawler._cfg.exhaustivescan:
            self.exhaustivescan = False
        else:
            self.exhaustivescan = True

        # Flushing outputs to help cron job watchdogs see if there is
        # activity
        sys.stdout.flush()
        sys.stderr.flush()

    def replaceString(self, content, stringtoreplace, replacestring):
        """Replace stringtoreplace with replacestring in multilined content

        content - multilined content (list of lines)
        stringtoreplace - the string which is to be replaced
        replacestring - the new string

        Returns a new list with the modified lines
        """
        return [s.replace(stringtoreplace, replacestring) for s in content]


def _printMemoryAnalysis():
    """Print object-size statistics gathered with the sizer package."""
    from sizer import scanner
    from sizer import annotate
    from sizer import formatting
    import xmlrpclib
    from sizer import graph
    from sizer import operations

    objs = scanner.Objects()
    #mods = annotate.simplegroupby(objs, classes=True)
    #formatting.printsizes(mods)
    formatting.printsizes(objs, count=100)
    #somegraph = graph.makegraph(objs, count = 15, proportional = True)
    #os.system('cp %(somegraph)s ./graph.ps'%locals())
    nostr = operations.fix(operations.filterouttype(objs, str))
    formatting.printsizesop(operations.bytype(objs), threshold=1000)
    lists = operations.filtertype(objs, list)
    formatting.printsizesop(operations.bysize(lists))

    # The 50 largest dictionaries, smallest first
    dicts = [(i.size, i.type, i) for i in objs.values() if i.type == dict]
    dicts.sort()
    dicts = dicts[-50:]
    print('Dicionaries:')
    for size, objtype, entry in dicts:
        print(entry)
        print(' type: %s' % (objtype,))
        print(' size: %s' % (size,))
        # Best effort: not every entry has these attributes.
        try:
            print(' parent: %s' % (entry.parent,))
        except Exception:
            pass
        try:
            print(' parents: %s' % (entry.parents,))
        except Exception:
            pass
        print(' children:')
        for item in entry.children:
            print('  %s' % (item,))


if __name__ == "__main__":
    print('Starting crawl')
    # The site and the testrun id are the last two arguments; optional
    # flags ('onlycrawl', 'memanalysis') go before them.
    if len(sys.argv) < 3:
        print("Usage: python crawlerwrapper site testrunid")
        sys.exit(1)
    testrunid = sys.argv[-1]
    site = sys.argv[-2]
    cr = Crawler(site, testrunid)
    cr.startCrawling()
    sys.stdout.flush()
    if 'memanalysis' in sys.argv:
        # Making sure all logs are emptied within this time...
        # Most likely a much better way to do this...
        time.sleep(60)
        _printMemoryAnalysis()