# -*- coding: UTF-8 -*-
"""Extracting information from web pages

Id$

Helpers that pull words, dates and day/month tokens out of HTML text and
out of the URL parts returned by ``urls.getURLInfo``.
"""
# Copyright 2008,2009 eGovMon
# This program is distributed under the terms of the GNU General
# Public License.
#
# This file is part of the eGovernment Monitoring
# (eGovMon)
#
# eGovMon is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# eGovMon is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with eGovMon; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301 USA

__author__ = "$Author$"
__version__ = "$Revision$"
__updated__ = "$LastChangedDate$"

import re
import string

from stopwords import *
from urls import *

# NOTE(review): the copy of this file that was reviewed had lost its line
# breaks and had HTML tags / entities stripped from inside string literals.
# Every literal tagged "reconstructed" below was inferred from the surviving
# code (function names, offsets such as ``+7``, neighbouring expressions) and
# must be confirmed against the version-control original.

# Reconstructed literals ------------------------------------------------------
_NBSP = '&nbsp;'        # NOTE(review): reconstructed; source showed ' '
_ARING = '&aring;'      # NOTE(review): reconstructed; source showed 'å' -> 'å'
_AMP = '&amp;'          # NOTE(review): reconstructed; source showed '&' twice
_ANCHOR_OPEN = '<a '    # NOTE(review): reconstructed; literal was stripped
_ANCHOR_CLOSE = '</a>'  # NOTE(review): reconstructed; literal was stripped

# NOTE(review): both patterns below were shown as r'(.*?)'; reconstructed.
titletag = re.compile(r'<title>(.*?)</title>', re.DOTALL)
h1tag = re.compile(r'<h1>(.*?)</h1>', re.DOTALL)
tags = re.compile(r'<.*?>', re.DOTALL)
attributevalues = re.compile(r'"(.*?)"', re.DOTALL)
nonattributevalues = re.compile(r'(.*?)".*"(.*?)', re.DOTALL)

# Several simple regular expressions are faster than one advanced one.
# datethree also matches inside anything dateone or datetwo match.
dateone = re.compile(r'[0-9][0-9][0-9][0-9]\.[0-9][0-9]\.[0-9][0-9]')
datetwo = re.compile(r'[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9][0-9][0-9]')
datethree = re.compile(r'[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9]')

# Two-digit numbers 00-31 and single digits 1-9 (the original comment calls
# these "Month Numbers").
largemonths = re.compile(r'[^0-9]([0-2][0-9]|[3][0-1])')
smallmonths = re.compile(r'[^0-9]([1-9])[^0-9]')

# Text months
months = re.compile(r'(januar|februar|mars|april|mai|juni|juli|august|september|oktober|november|desember)')

# Text days
days = re.compile(r'(mandag|måndag|tirsdag|tysdag|onsdag|torsdag|fredag|lørdag|laurdag|søndag|syndag)')

# Mail record words
words = re.compile(r'(postliste|postjournal|innsyn|postlister)')

# Text links
# NOTE(review): shown as r'(.*?)'; reconstructed.
textlinks = re.compile(r'<a .*?>(.*?)</a>', re.DOTALL)

# Characters kept when text is reduced to plain words.  Sets give the same
# membership answers as the original ``i in <string>`` tests.
_WORD_CHARS = frozenset(string.ascii_lowercase + 'åæø 1234567890')
_LETTER_CHARS = frozenset(string.ascii_lowercase + 'åæø ')

# Tokens accepted by getAllMonths: '1'..'31' plus zero-padded '01'..'09'.
_DAY_NUMBERS = frozenset([str(i) for i in range(1, 32)] +
                         ['01', '02', '03', '04', '05', '06', '07', '08', '09'])

# Substrings turned into spaces by getAttributes, in the original order
# (the entity must be handled before the bare '&').
_URL_SEPARATORS = ('-', '?', '/', '.', _AMP, '&', '=')


def _keepChars(text, allowed):
    """Drop every character of text not in allowed, then split on whitespace."""
    return ''.join([c for c in text if c in allowed]).split()


def _elementWords(data, openMarker, closeTag, limit):
    """Return the words of the first openMarker..closeTag span in data.

    data is lower-cased first.  The slice starts at the first occurrence of
    openMarker and ends at the first occurrence of closeTag; everything up
    to the first '>' (the opening tag's attributes) is skipped.  Spans longer
    than limit characters yield an empty list.
    """
    data = data.lower()
    if openMarker not in data:
        return []
    span = data[data.find(openMarker):data.find(closeTag)].replace(_NBSP, ' ')
    span = span[span.find('>'):]
    if len(span) > limit:
        return []
    return _keepChars(span, _WORD_CHARS)


def _linkWords(data, allowed, accept=None):
    """Collect the words of every anchor in data, filtered to allowed chars.

    accept, when given, is called with the anchor text (attributes included)
    and the anchor is skipped when it returns a false value.  Anchors whose
    text exceeds 10000 characters are skipped.

    NOTE(review): the loop header was lost in the reviewed copy; iterating
    over ``data.split(_ANCHOR_OPEN)[1:]`` is a reconstruction.
    """
    # The leading space is kept from the original; its purpose is not evident
    # from the surviving code.
    data = ' ' + data.lower()
    if _ANCHOR_OPEN not in data:
        return []
    found = []
    for chunk in data.split(_ANCHOR_OPEN)[1:]:
        thisdata = chunk[:chunk.find(_ANCHOR_CLOSE)].replace(_NBSP, ' ')
        if accept is not None and not accept(thisdata):
            continue
        # Skip the anchor's attributes.
        thisdata = thisdata[thisdata.find('>'):]
        if len(thisdata) > 10000:
            continue
        found += _keepChars(' '.join(getOnlyWords(thisdata)), allowed)
    return found


def getH1Text(data):
    """Return the lower-cased words (letters, åæø, digits) of the first h1.

    NOTE(review): the '<h1' / '</h1>' literals are reconstructed.
    """
    return _elementWords(data, '<h1', '</h1>', 1000000)


def getLinkTextPDF(data):
    """Return the words of all links whose anchor contains '.pdf"'."""
    return _linkWords(data, _WORD_CHARS, lambda text: '.pdf"' in text)


def getDateLinks(data):
    """Return the letter-only words of all links containing a dd.dd.dd date."""
    return _linkWords(data, _LETTER_CHARS, datethree.search)


def getLinkText(data):
    """Return the words (letters, åæø, digits) of all links in data."""
    return _linkWords(data, _WORD_CHARS)


def getBText(data):
    """Return the lower-cased words (letters, åæø, digits) of the first b tag.

    NOTE(review): the '<b' / '</b>' literals are reconstructed; '<b' also
    matches other tags starting with "b" -- confirm against the original.
    """
    return _elementWords(data, '<b', '</b>', 10000)


def getTitleText(data):
    """Return the lower-cased words of the page title, or [] without one.

    Dots are treated as word separators; all characters other than a-z, åæø,
    space and digits are dropped.

    NOTE(review): the '<title>' / '</title>' literals are reconstructed; the
    surviving ``+7`` offset matches len('<title>').
    """
    data = data.lower()
    if '<title>' not in data or '</title>' not in data:
        return []
    title = data[data.find('<title>') + 7:data.find('</title>')]
    title = title.replace(_NBSP, ' ').replace('.', ' ')
    return _keepChars(title, _WORD_CHARS)


def getAllData(data, domain, type):
    """Return the feature words for a page; currently only the title words.

    domain and type are accepted but unused.  Commented-out variants in the
    original also combined getH1Text, getFilename, getAttributes,
    getLinkTextPDF and getDateLinks through removestopwords.
    """
    return getTitleText(data)


# NOTE(review): getProtocol calls getURLInfo(domain, type) while getAttributes
# and getFilename call getURLInfo(type, domain).  getURLInfo is defined in
# ``urls`` and not visible here, so the original argument orders are kept;
# one of the two orders is probably wrong -- verify against urls.getURLInfo.

def getProtocol(domain, type):
    """Return a one-element list with item 0 of getURLInfo's result ('' if none)."""
    protocol = getURLInfo(domain, type) or ('',)
    return [protocol[0]]


def getAttributes(domain, type):
    """Return mail-record keywords found in items 2-4 of getURLInfo's result.

    Returns [''] when getURLInfo gives nothing or the concatenated items are
    only separator characters.  Otherwise every '2' is removed, separators
    become spaces, and the tokens equal to 'postliste' or 'innsyn' or
    containing 'journal' are returned.
    """
    attributes = getURLInfo(type, domain)
    if not attributes:
        return ['']
    combined = attributes[2] + attributes[3] + attributes[4]
    if not combined.strip('/').strip('.').strip('&').strip('='):
        return ['']
    combined = combined.replace('2', '')
    for separator in _URL_SEPARATORS:
        combined = combined.replace(separator, ' ')
    return [i for i in combined.split()
            if i in ('postliste', 'innsyn') or 'journal' in i]


def getFilename(domain, type):
    """Return item 2 of getURLInfo's result split on '/' and '.'.

    Returns [''] when getURLInfo gives nothing or the item is only
    separator characters.
    """
    filename = getURLInfo(type, domain)
    if not filename or not filename[2].strip('/').strip('.'):
        return ['']
    return filename[2].replace('/', ' ').replace('.', ' ').split()


def getOnlyWords(data):
    """Return the whitespace-separated words of data with all tags removed."""
    text = data.replace(_NBSP, ' ').replace(_ARING, 'å')
    # Replacing each tag by a space yields the same tokens as splitting on
    # tags and re-joining the stripped pieces.
    return tags.sub(' ', text).split()


def getAttributeValues(data):
    """Return lower-cased words from the pieces of data split on quoted values.

    Pieces containing '=', '>', '<', 'http://', '&' or '#', or that are 100
    characters or longer, are discarded.
    """
    kept = [i.lower() for i in attributevalues.split(data)
            if '=' not in i and '>' not in i and '<' not in i
            and 'http://' not in i and '&' not in i and '#' not in i
            and len(i) < 100]
    return ' '.join(kept).split()


def getAllProperdates(data):
    """Return all yyyy.dd.dd, dd.dd.yyyy and dd.dd.dd matches, in that order."""
    return dateone.findall(data) + datetwo.findall(data) + datethree.findall(data)


def getUniqueDates(data):
    """Return the date matches that occur exactly once among all matches."""
    somedates = getAllProperdates(data)
    # Count once instead of calling list.count for every element.
    counts = {}
    for date in somedates:
        counts[date] = counts.get(date, 0) + 1
    return [date for date in somedates if counts[date] == 1]


def getAllMonths(data):
    """Return the whitespace-separated tokens of data that are numbers 1-31.

    Both plain ('1'..'31') and zero-padded ('01'..'09') forms are accepted.
    """
    return [i for i in data.split() if i in _DAY_NUMBERS]


def getMonthNames(data):
    """Return every Norwegian month name found in data."""
    return months.findall(data)


def getCommonWords(data):
    """Return every mail-record keyword found in data."""
    return words.findall(data)


def getWeekdays(data):
    """Return every Norwegian weekday name found in data."""
    return days.findall(data)


def getTextLinks(data):
    """Return every match of the textlinks pattern in data."""
    return textlinks.findall(data)