Forums

UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 15: ordinal not in range(128)

I am new to python scrapping. I have taken this code quite long ago from this website. I am trying to change some stuff so that i can my self crawl links from google and then index them and stored them. But problem is that i am facing strange error i dont know why this is occuring.

Kindly help. My code (i am also placing link of file: http://www.uploadmb.com/dw.php?id=1455203992)

#!/usr/bin/env python
try:
    import psyco
    psyco.full()
except:
    pass
from pylab import*
from scipy import stats
from copy import deepcopy

import urllib
from urllib2 import unquote
from bs4 import BeautifulSoup
import requests
from urlparse import urljoin
import sqlite3 as sqlite
stem = False
if stem: import pracstem

# There are two lines,110 and 138, in this program that make use of this escape character as the condition. Uncomment both of them.
# Add more unwanted characters in url. 
#escapeChar=['add some characters here']

# Create a list of words to ignore
ignorewords = file('stopwords').read().split('\n')
ignorewords = set([word for word in ignorewords if len(word) > 2])
iter = 20
### CLASS CRAWLER STARTS HERE###
class crawler (object):

  # Initialize the crawler with the name of database
    def __init__(self, dbname):
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    def dynamicpage(self, page):
        if '?' or '%' in page: return True
        else: return False

    def contentlength(self, soup):
        content = ''
        for paragraph in soup('p'):
            for node in paragraph.contents:
                if type(node)!=None:
                    content += str(node.string.encode('utf-8'))
                    content += ' '
        return len(content)

    # Auxilliary function for getting an entry id and adding
    # it if it's not present
    def getentryid(self, table, field, value, createnew = True):
        cur = self.con.execute ("select rowid from %s where %s = '%s'" %(table, field, value))
        res = cur.fetchone()
        if res is None:
            cur = self.con.execute("insert into %s (%s) values('%s')" %(table, field, value))
            return cur.lastrowid
        else:
            return res[0]

  # Index an individual page
    def addtoindex(self, url, soup):
        if self.isindexed (url): return
        print 'Indexing ' + url
        # Get the individual words
        text = self.gettextonly(soup)
        words = self.separatewords(text)
        if stem: words = pracstem.stem(words)
        # Get the URL id
        urlid = self.getentryid('urllist', 'url', url)
        # Link each word to this url
        for i in range(len(words)):
            word = words[i]
            if word in ignorewords: continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute("insert into wordlocation(urlid, wordid, location) values(%i, %i, %i)" %(urlid, wordid, i))

  # This function will only index title and paragraph content of the page
    def addtoindexnew(self, url, soup):
        if self.isindexed (url): return
        print 'Indexing ' + url
        #Get only contents of each paragraph
        content = ''
        for paragraph in soup('p'):
            for node in paragraph.contents:
                if type(node)!=None:
                    content += str(node.string).encode('utf-8')
                    content += ' '
        #Get also title of the page
        title = soup('title')[0].string
        #title + content
        text = title + ' ' + content
        # Get the individual words
        words = self.separatewords(text)
        if stem: words = pracstem.stem(words)
        # Get the URL id
        urlid = self.getentryid('urllist', 'url', url)
        # Link each word to this url
        for i in range(len(words)):
            word = words[i]
            if word in ignorewords: continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute("insert into wordlocation(urlid, wordid, location) values(%i, %i, %i)" %(urlid, wordid, i))


  # Extract the text from an HTML page (no tags)
    def gettextonly(self, soup):
        v = soup.string
        if v is None:
            c = soup.contents
            resulttext = ''
            for t in c:
                subtext = self.gettextonly(t)
                resulttext += subtext + '\n'
            return resulttext
        else: return v.strip()

  # Separate the words by any non-whitespace character
    def separatewords(self, text):
        splitter = re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if len(s)>2 and len(s)<20]

  # Return true if this url is already indexed
    def isindexed(self, url):
        u = self.con.execute("select rowid from urllist where url = '%s'" %url).fetchone()
        if u is not None:
            # Check if it has actually been crawled
            v = self.con.execute('select * from wordlocation where urlid = %i' %u[0]).fetchone ()
            if v is not None: return True
        return False

  # Add a link between two pages
    def addlinkref(self, urlFrom, urlTo, linkText):
        words = self.separatewords(linkText)
        if stem: words = pracstem.stem(words)
        fromid = self.getentryid('urllist', 'url', urlFrom)
        toid = self.getentryid('urllist', 'url', urlTo)
        if fromid == toid: return
        cur = self.con.execute("insert into link(fromid, toid) values(%i, %i)" %(fromid, toid))
        linkid = cur.lastrowid
        for word in words:
            if word in ignorewords: continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute("insert into linkwords(linkid, wordid) values(%i, %i)" %(linkid, wordid))

    # Starting with a list of pages, do a breadth
    # First search to the given depth, indexing pages as we go
    def crawl(self, pages, depth = 2):
        d = requests.get("https://www.google.dz/search?q="+pages)
        for i in range(depth):
            newpages = set()

                # if (escapeChar[i] in page for i in range(len(escapeChar))): continue  #Uncomment to activate this condition
                #try:


                    #urllib2.urlopen(page)

                #except: 
                 #   print "Could not open %s" %page
                 #   continue

                # try:



                # except: 
                #    print "Couldn't Beauticate %s" %page
                #    continue
                # See if the page content long enough 
#           if self.contentlength(soup) < 500:
#                continue
                # Find the hyperlinks and index its anchor text
            self.createindextables()
            soup = BeautifulSoup(d.content)
            #try: 
            self.addtoindexnew(page, soup)  #Index the content of the page
            #except:
             #       print "Could't index this page"
              #      continue

            url = soup.findAll("a")
            for link in  url:
                if link['href'].startswith('/url?q=') \
                and 'webcache.googleusercontent.com' not in link['href']:
                    print link['href'].split('/url?q=')[1].split('&')[0]
                        # if (escapeChar[i] in url for i in range(len(escapeChar))): continue       #Uncomment to activate this condition
                        # If both page and hyperlink on it are dynamic page, do not add the link to the queue


                    if self.dynamicpage(page) and self.dynamicpage(url): continue
                    if url[0:4] == 'http' and not self.isindexed(url): newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)
                    try: 
                        self.addtoindexnew(page, soup)  #Index the content of the page
                    except:
                        print "Could't index this page"
                        continue

            self.dbcommit()
            pages = newpages

    def createindextables(self):
        self.con.execute('create table urllist(url text)')
        self.con.execute('create table wordlist(word text)')
        self.con.execute('create table wordlocation(urlid integer, wordid integer, location integer)')
        self.con.execute('create table link(fromid integer, toid integer)')
        self.con.execute('create table linkwords(linkid integer, wordid integer)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.con.execute('create index linkwordidx on linkwords(wordid)')
        self.con.execute('create index linkidx on linkwords(linkid)')
        self.dbcommit()

    def calculatelength(self):
        self.con.execute("drop table if exists pagelength")
        self.con.execute("create table pagelength(urlid integer primary key, length integer)")
        self.con.execute("insert into pagelength select rowid, 0 from urllist")
        self.dbcommit()
        for (urlid,) in self.con.execute('select rowid from urllist'):
            loc = self.con.execute("select location from wordlocation where urlid=%i" %urlid)
            locs = [id for id in loc]
            if len(locs) == 0: continue
            length = locs[len(locs)-1][0]
            self.con.execute("update pagelength set length = %i where urlid = %i" %(length, urlid))
        self.dbcommit()


    def calculatepagerank(self, iter):
        # Clear out the current pagerank tables
        self.con.execute('drop table if exists pagerank')
        # Create pagerank table
        self.con.execute('create table pagerank(urlid integer primary key, score float)')
        # Clear out the current prconvergence table
        self.con.execute('drop table if exists prconvergence')
        # Create prconvergence table
        self.con.execute('create table prconvergence(residu float)')
        # Calculate number of pages in the database and create array that store current ranking vector
        n = self.con.execute('select count(*) from urllist').fetchone()[0]

        # Initialize every url with a PageRank of PR0
        pr0 = (1.0/n)
        self.con.execute('insert into pagerank select rowid, %f from urllist' %pr0)
        self.dbcommit()
        # create array that store current scores
        currentscores = zeros(n)
        # Start the iterations
        for i in range(iter):
            prevscores = deepcopy(currentscores)
            print "Iteration %i" %(i)
            for (urlid,) in self.con.execute('select rowid from urllist'):
                pr = 0.15/n
                # Loop through all the pages that link to this one
                for (linker,) in self.con.execute('select distinct fromid from link where toid = %i' %urlid):
                    # Get the PageRank of the linker
                    linkingpr = self.con.execute('select score from pagerank where urlid = %i' %linker).fetchone()[0]
                    # Get the total number of links from the linker
                    linkingcount = self.con.execute('select count(*) from link where fromid = %i' %linker).fetchone()[0]
                    pr += 0.85*linkingpr/linkingcount
                currentscores[urlid-1] = pr
                self.con.execute('update pagerank set score = %f where urlid = %i' %(pr, urlid))
            residu = sum(abs(currentscores-prevscores))
            self.con.execute('insert into prconvergence(residu) values(%f)'%residu)
            self.dbcommit()

    def calculatepagerank2(self, iter):
        # Clear out the current pagerank tables
        self.con.execute('drop table if exists pagerank')
        # Create pagerank table
        self.con.execute('create table pagerank(urlid integer primary key, score float)')
        # Clear out the current prconvergence table
        self.con.execute('drop table if exists prconvergence')
        # Create prconvergence table
        self.con.execute('create table prconvergence(residu float)')
        # Calculate number of pages in the database and create array that store current ranking vector
        n = self.con.execute('select count(*) from urllist').fetchone()[0]

        # Initialize every url with a PageRank of PR0
        pr0 = 1.0/n
        self.con.execute('insert into pagerank select rowid, %f from urllist' %pr0)
        self.dbcommit()
        # create array that store current scores
        currentscores = zeros(n)
        # Start the iterations
        for i in range(iter):
            prevscores = deepcopy(currentscores)
            dscore = 0.0
            print "Iteration %i" %(i)

            # Score from dangling nodes
            for (urlid,) in self.con.execute('select rowid from urllist'):
                if self.con.execute('select val from dnode where id = %i' %urlid).fetchone() == 1:
                    dscore += self.con.execute('select score from pagerank where urlid = %i' %urlid).fetchone()

            for (urlid,) in self.con.execute('select rowid from urllist'):
                pr = (0.15 + 0.85*dscore)/n
                # Loop through all the pages that link to this one
                for (linker,) in self.con.execute('select distinct fromid from link where toid = %i' %urlid):
                    # Get the PageRank of the linker
                    linkingpr = self.con.execute('select score from pagerank where urlid = %i' %linker).fetchone()[0]
                    # Get the total number of links from the linker
                    linkingcount = self.con.execute('select count(*) from link where fromid = %i' %linker).fetchone()[0]
                    pr += 0.85*linkingpr/linkingcount
                currentscores[urlid-1] = pr
                self.con.execute('update pagerank set score = %f where urlid = %i' %(pr, urlid))
            residu = sum(abs(currentscores-prevscores))
            self.con.execute('insert into prconvergence(residu) values(%f)'%residu)
            self.dbcommit()

    def calculatemypagerank(self, iter):
        # Clear out the current mypagerank table
        self.con.execute('drop table if exists mypagerank')
        # Create mypagerank table
        self.con.execute('create table mypagerank(urlid integer primary key, score float, cp float)')
        # Clear out the current myprconvergence table
        self.con.execute('drop table if exists myprconvergence')
        # Create myprconvergence table
        self.con.execute('create table myprconvergence(residu float)')
        # Calculate number of pages in the database and create array that store current ranking vector
        n = self.con.execute('select count(*) from urllist').fetchone()[0]
        mpr0 = 1.0/n
        # Initialize every url with a mypagerank and cp  
        for (urlid,) in self.con.execute('select rowid from urllist'):
            self.con.execute("insert into mypagerank(urlid, score, cp) values (%i, %f, %f)" %(urlid, mpr0, 1.0))
        self.dbcommit()
        # Find number of inlinks and outlinks of each url
        for (urlid,) in self.con.execute('select rowid from urllist'):
            inlinks = self.con.execute('select count (*) from link where toid = %i' %urlid).fetchone()[0]
            outlinks = self.con.execute('select count (*) from link where fromid = %i' %urlid).fetchone()[0]
            if outlinks == 0: outlinks = 0.1
            if inlinks > outlinks: p = 1
            elif inlinks < outlinks: p = -1
            else: p = 0
            # Calculate cp
            cp = abs (inlinks - outlinks)**p
            cp *= (inlinks + outlinks)*1/float(outlinks)
            self.con.execute('update mypagerank set cp = %f where urlid = %i' %(cp, urlid))
        self.dbcommit()

        # create array that store current ranking vector
        currentscores = zeros(n)
        #Start the score calculation
        for i in range(iter):
            prevscores = deepcopy(currentscores)
            print "Iteration %i" %(i)
            for (urlid,) in self.con.execute('select rowid from urllist'):
                score = 0.15/n
                # Loop through all the pages that link to this one
                for (linker,) in self.con.execute('select distinct fromid from link where toid = %i' %urlid):
                    # Get linker score from mypagerank
                    linkingscore = self.con.execute('select score from mypagerank where urlid = %i' %linker).fetchone()[0]
                    # Get linker cp from mypagerank
                    linker_cp = self.con.execute('select cp from mypagerank where urlid = %i' %linker).fetchone()[0]
                    score += 0.85*linkingscore*linker_cp
                currentscores[urlid-1] = score
                self.con.execute('update mypagerank set score = %f where urlid = %i' %(score, urlid))
            residu = sum(abs(currentscores-prevscores))
            self.con.execute('insert into myprconvergence(residu) values(%f)'%residu)
            self.dbcommit()

    def calculatehits (self, iter):
        # Clear out current auth_hits tables if already existed
        self.con.execute('drop table if exists auth_hits')
        # Create auth_hits tables
        self.con.execute('create table auth_hits(urlid integer primary key, score float)')
        # Clear out current hub_hits tables if already existed
        self.con.execute('drop table if exists hub_hits')
        # Create hub_hits tables
        self.con.execute('create table hub_hits(urlid integer primary key, score float)')
        # Clear out current auth_hitsconvergence tables if already existed
        self.con.execute('drop table if exists auth_hitsconvergence')
        # Create auth_hitsconvergence tables
        self.con.execute('create table auth_hitsconvergence(residu float)')
        # Clear out current hub_hitsconvergence tables if already existed
        self.con.execute('drop table if exists hub_hitsconvergence')
        # Create hub_hitsconvergence tables
        self.con.execute('create table hub_hitsconvergence(residu float)')
        # calculate number of pages in database
        n = self.con.execute('select count(*) from urllist').fetchone()[0]
        # initial value
        intval = 1.0/n
        # Initialize every url with a authority and hub scores
        self.con.execute('insert into auth_hits select rowid, %f from urllist' %intval)
        self.con.execute('insert into hub_hits select rowid, %f from urllist' %intval)
        self.dbcommit()

        auth_currentscores = zeros(n)
        hub_currentscores = zeros(n)
        # Start the iterations
        for i in range(iter):
            agg_authScore = 0.0
            agg_hubScore = 0.0
            print "Iteration %i" %(i)
            # Authority part
            auth_prevscores = deepcopy(auth_currentscores)
            for (urlid,) in self.con.execute ('select rowid from urllist'):
                authScore = 0.0
                # Loop through all the pages that link to this one
                for (linkerHub,) in self.con.execute('select distinct fromid from link where toid = %i' %urlid):
                    # Get the hub scores of the linker
                    linker_hubScore = self.con.execute('select score from hub_hits where urlid = %i' %linkerHub).fetchone()[0]
                    authScore += linker_hubScore
                self.con.execute('update auth_hits set score = %f where urlid = %i' %(authScore, urlid))
                agg_authScore += authScore
            for (urlid,) in self.con.execute('select rowid from urllist'):
                NauthScore = self.con.execute('select score from auth_hits where urlid = %i' %urlid).fetchone()[0]
                normalized_authScore = NauthScore*1/agg_authScore
                auth_currentscores[urlid-1] = normalized_authScore
                self.con.execute('update auth_hits set score = %f where urlid = %i' %(normalized_authScore, urlid))
            self.dbcommit()
            # Hub part
            hub_prevscores = deepcopy(hub_currentscores)            
            for (urlid,) in self.con.execute('select rowid from urllist'):
                hubScore = 0.0
                # Loop through all the pages that being linked by this one
                for (linkerAuth,) in self.con.execute('select distinct toid from link where fromid = %i' %urlid):
                    # Get the authority scores of the linker
                    linker_authScore = self.con.execute('select score from auth_hits where urlid = %i' %linkerAuth).fetchone()[0]
                    hubScore += linker_authScore
                self.con.execute('update hub_hits set score = %f where urlid = %i' %(hubScore, urlid))
                agg_hubScore += hubScore
            for (urlid,) in self.con.execute('select rowid from urllist'):
                NhubScore = self.con.execute('select score from hub_hits where urlid = %i' %urlid).fetchone()[0]
                normalized_hubScore = NhubScore*1/agg_hubScore
                hub_currentscores[urlid-1] = normalized_hubScore
                self.con.execute ('update hub_hits set score = %f where urlid = %i' %(normalized_hubScore, urlid))
            auth_res = sum(abs(auth_currentscores-auth_prevscores))
            self.con.execute('insert into auth_hitsconvergence(residu) values(%f)'%auth_res)
            hub_res = sum(abs(hub_currentscores-hub_prevscores))
            self.con.execute('insert into hub_hitsconvergence(residu) values(%f)'%hub_res)
            self.dbcommit()

    def calculatemyhits(self, iter):
        # Clear out current auth_myhits table if have already existed
        self.con.execute('drop table if exists auth_myhits')
        # Create auth_myhits table
        self.con.execute('create table auth_myhits(urlid integer primary key, score float, ca float)')
        # Clear out current hub_myhits table if have already existed
        self.con.execute('drop table if exists hub_myhits')
        # Create hub_myhits table
        self.con.execute('create table hub_myhits(urlid integer primary key, score float, ch float)')
        # Clear out current auth_myhitsconvergence tables if already existed
        self.con.execute('drop table if exists auth_myhitsconvergence')
        # Create auth_myhitsconvergence tables
        self.con.execute('create table auth_myhitsconvergence(residu float)')
        # Clear out current hub_myhitsconvergence tables if already existed
        self.con.execute('drop table if exists hub_myhitsconvergence')
        # Create hub_myhitsconvergence tables
        self.con.execute('create table hub_myhitsconvergence(residu float)')
        # calculate number of pages in database
        n = self.con.execute('select count(*) from urllist').fetchone()[0]
        # initial value
        intval = 1.0/n
        # Initialize every url with a authority and hub scores
        for (urlid,) in self.con.execute('select rowid from urllist'):
            self.con.execute("insert into auth_myhits(urlid, score, ca) values(%i, %f, %f)" %(urlid, intval, 1.0))
            self.con.execute("insert into hub_myhits(urlid, score, ch) values(%i, %f, %f)" %(urlid, intval, 1.0))
        self.dbcommit()
        # Find number of inlinks and outlinks of each url
        for (urlid,) in self.con.execute('select rowid from urllist'):
            inlinks = self.con.execute('select count(*) from link where toid = %i' %urlid).fetchone()[0]
            outlinks = self.con.execute('select count(*) from link where fromid = %i' %urlid).fetchone()[0]
            if inlinks > outlinks: p = 1
            elif inlinks < outlinks: p = -1
            else: p = 0
            # Calculate constant for authority part
            ca = abs (inlinks - outlinks)**p
            ca *= float(inlinks)*1/(inlinks + outlinks)
            if ca == 0.0: ca = 0.00001
            # Calculate constant for hub part
            ch = abs (inlinks - outlinks)**(-p)
            ch *= float(outlinks)*1/(inlinks + outlinks)
            if ch == 0.0: ch = 0.00001
            self.con.execute('update auth_myhits set ca = %f where urlid = %i' %(ca, urlid))
            self.con.execute('update hub_myhits set ch = %f where urlid = %i' %(ch, urlid))
        self.dbcommit()

        auth_currentscores = zeros(n)
        hub_currentscores = zeros(n)
        # Start the scores calculation process
        for i in range(iter):
            agg_authScore = 0.0
            agg_hubScore  = 0.0
            print "Iteration %i" %(i)
            # Authority part
            auth_prevscores = deepcopy(auth_currentscores)
            for (urlid,) in self.con.execute('select rowid from urllist'):
                authScore = 0.0
                # Loop through all the pages that link to this one
                for (linker,) in self.con.execute('select distinct fromid from link where toid = %i' %urlid):
                    # Get the hub scores and ch of the linker
                    linker_hubScore = self.con.execute('select score from hub_myhits where urlid = %i' %linker).fetchone()[0]
                    linker_ch = self.con.execute ('select ch from hub_myhits where urlid = %i' %linker).fetchone()[0]
                    authScore += linker_hubScore*linker_ch
                self.con.execute ('update auth_myhits set score = %f where urlid = %i' %(authScore, urlid))
                agg_authScore += authScore
            for (urlid,) in self.con.execute('select rowid from urllist'):
                NauthScore = self.con.execute('select score from auth_myhits where urlid = %i' %urlid).fetchone()[0]
                normalized_authScore = NauthScore*1/agg_authScore
                auth_currentscores[urlid-1] = normalized_authScore
                self.con.execute('update auth_myhits set score = %f where urlid = %i' %(normalized_authScore, urlid))
            self.dbcommit()
            # Hub part
            hub_prevscores = deepcopy(hub_currentscores)
            for (urlid,) in self.con.execute('select rowid from urllist'):
                hubScore = 0.0
                # Loop through all the pages that being linked by this one
                for (linker,) in self.con.execute('select distinct toid from link where fromid = %i' %urlid):
                    # Get the authority scores and ca of the linker
                    linker_authScore = self.con.execute('select score from auth_myhits where urlid = %i' %linker).fetchone()[0]
                    linker_ca = self.con.execute('select ca from auth_myhits where urlid = %i' %linker).fetchone()[0]
                    hubScore += linker_authScore*linker_ca
                self.con.execute('update hub_myhits set score = %f where urlid = %i' %(hubScore, urlid))
                agg_hubScore += hubScore
            for (urlid,) in self.con.execute('select rowid from urllist'):
                NhubScore = self.con.execute('select score from hub_myhits where urlid = %i' %urlid).fetchone()[0]
                normalized_hubScore = NhubScore*1/agg_hubScore
                hub_currentscores[urlid-1] = normalized_hubScore
                self.con.execute('update hub_myhits set score=%f where urlid = %i' %(normalized_hubScore, urlid))
            auth_res = sum(abs(auth_currentscores-auth_prevscores))
            self.con.execute('insert into auth_myhitsconvergence(residu) values(%f)'%auth_res)
            hub_res = sum(abs(hub_currentscores-hub_prevscores))
            self.con.execute('insert into hub_myhitsconvergence(residu) values(%f)'%hub_res)
            self.dbcommit()

    def calculateall(self):
        self.calculatepagerank(iter)
        #self.calculatemypagerank(iter)
        self.calculatehits(iter)
        self.calculatemyhits(iter)
        #self.calculatelength()

    def plotconvergence(self, name = 'http://www.britannica.com/blogs/'):
        x = arange(iter)

        prscores = []
        for (residu, ) in self.con.execute('select residu from prconvergence'):
            prscores.append(residu)

        authscores = []
        for (residu, ) in self.con.execute('select residu from auth_hitsconvergence'):
            authscores.append(residu)

        myauthscores = []
        for (residu, ) in self.con.execute('select residu from auth_myhitsconvergence'):
            myauthscores.append(residu)

        semilogy(x, prscores, 'b:s', x, authscores, 'r:o', x, myauthscores, 'g--d')
        xlabel('Iterations')
        ylabel('Error (log scale)')
        title('Convergence Rate (' + name + ')')
        legend(('PageRank','HITS (authority)','modified HITS (authority)'))
        grid(True)
        show()

    def fact(self):
        # Number of nodes in the network
        nodeNum = self.con.execute('select count(*) from urllist').fetchone()[0]
        # Non-zero entries
        nonZero = 0
        for (id,) in self.con.execute('select linkid from linkwords'):
            nonZero += 1

        # SIMILARITY MEASURES
        # Calculate Array of ranking vector
        prvector = []                     # PageRank vector
        for (score, ) in self.con.execute('select score from pagerank'):
            prvector.append(score)
        authvector = []                   # HITS vector
        for (score, ) in self.con.execute('select score from auth_hits'):
            authvector.append(score)
        myauthvector = []                 # modified HITS vector
        for (score, ) in self.con.execute('select score from auth_myhits'):
            myauthvector.append(score)
        inboundlink = []
        for (urlid,) in self.con.execute('select rowid from urllist'):
            count = 0
            for (linker,) in self.con.execute('select distinct fromid from link where toid = %i' %urlid):
                count += 1
            inboundlink.append(count)     # Inboundlink vector
        # Change lists into arrays
        prvector     = array(prvector)
        authvector   = array(authvector)
        myauthvector = array(myauthvector)
        inboundlink  = array(inboundlink)

        ###################################
        # Similarity between PageRank and authority part of HITS
        # Cosine criterion
        num = sum(prvector*authvector)
        den = sqrt(sum(prvector*prvector))*sqrt(sum(authvector*authvector))
        simCosPH = float(num)/den
        # Spearman rank order correlation coefficient criterion
        spPH = stats.spearmanr(prvector, authvector)[0]
        # Kendall's Tau rank order correlation coefficient criterion
        ktPH = stats.kendalltau(prvector, authvector)[0]

        # Similarity between PageRank and authority part of modified HITS
        # Cosine criterion
        num = sum(prvector*myauthvector)
        den = sqrt(sum(prvector*prvector))*sqrt(sum(myauthvector*myauthvector))
        simCosPmH = float(num)/den
        # Spearman rank order correlation coefficient criterion
        spPmH = stats.spearmanr(prvector, myauthvector)[0]
        # Kendall's Tau rank order correlation coefficient criterion
        ktPmH = stats.kendalltau(prvector, myauthvector)[0]

        # Similarity between PageRank and Inboundlink
        # Cosine criterion
        num = sum(prvector*inboundlink)
        den = sqrt(sum(prvector*prvector))*sqrt(sum(inboundlink*inboundlink))
        simCosPI = float(num)/den
        # Spearman rank order correlation coefficient criterion
        spPI = stats.spearmanr(prvector, inboundlink)[0]
        # Kendall's Tau rank order correlation coefficient criterion
        ktPI = stats.kendalltau(prvector, inboundlink)[0]

        ###################################     
        # Similarity between authority part of HITS and PageRank
        # Cosine criterion
        simCosHP = simCosPH
        # Spearman rank order correlation coefficient criterion
        spHP = spPH
        # Kendall's Tau rank order correlation coefficient criterion
        ktHP = ktPH

        # Similarity between authority part of HITS and authority part of modified HITS
        # Cosine criterion
        num = sum(authvector*myauthvector)
        den = sqrt(sum(authvector*authvector))*sqrt(sum(myauthvector*myauthvector))
        simCosHmH = float(num)/den
        # Spearman rank order correlation coefficient criterion
        spHmH = stats.spearmanr(authvector, myauthvector)[0]
        # Kendall's Tau rank order correlation coefficient criterion
        ktHmH = stats.kendalltau(authvector, myauthvector)[0]

        # Similarity between authority part of HITS and Inboundlink
        # Cosine criterion
        num = sum(authvector*inboundlink)
        den = sqrt(sum(authvector*authvector))*sqrt(sum(inboundlink*inboundlink))
        simCosHI = float(num)/den
        # Spearman rank order correlation coefficient criterion 
        spHI = stats.spearmanr(authvector, inboundlink)[0]
        # Kendall's Tau rank order correlation coefficient criterion    
        ktHI = stats.kendalltau(authvector, inboundlink)[0]

        ###################################

        ###################################     
        # Similarity between authority part of modified HITS and PageRank
        # Cosine criterion
        simCosmHP = simCosPmH
        # Spearman rank order correlation coefficient criterion
        spmHP = spPmH
        # Kendall's Tau rank order correlation coefficient criterion
        ktmHP = ktPmH

        # Similarity between authority part of modified HITS and authority part of HITS
        # Cosine criterion
        simCosmHH = simCosHmH
        # Spearman rank order correlation coefficient criterion
        spmHH = spHmH
        # Kendall's Tau rank order correlation coefficient criterion
        ktmHH = ktHmH

        # Similarity between authority part of modified HITS and Inboundlink
        # Cosine criterion
        num = sum(myauthvector*inboundlink)
        den = sqrt(sum(myauthvector*myauthvector))*sqrt(sum(inboundlink*inboundlink))
        simCosmHI = float(num)/den
        # Spearman rank order correlation coefficient criterion 
        spmHI = stats.spearmanr(myauthvector, inboundlink)[0]
        # Kendall's Tau rank order correlation coefficient criterion    
        ktmHI = stats.kendalltau(myauthvector, inboundlink)[0]

        ###################################

        ###################################     
        # Similarity between Inboundlink and PageRank
        # Cosine criterion
        simCosIP = simCosPI
        # Spearman rank order correlation coefficient criterion
        spIP = spPI
        # Kendall's Tau rank order correlation coefficient criterion
        ktIP = ktPI

        # Similarity between Inboundlink and authority part of HITS
        # Cosine criterion
        simCosIH = simCosHI
        # Spearman rank order correlation coefficient criterion
        spIH = spHI
        # Kendall's Tau rank order correlation coefficient criterion
        ktIH = ktHI

        # Similarity between Inboundlink and authority part of modified HITS
        # Cosine criterion
        simCosImH = simCosmHI
        # Spearman rank order correlation coefficient criterion 
        spImH = spmHI
        # Kendall's Tau rank order correlation coefficient criterion    
        ktImH = ktmHI

        ###################################

        print '-----------------------------------------------'
        print 'nodeNum = %i, nonZero = %i'%(nodeNum, nonZero)
        print '-----------------------------------------------'
        print 'SIMILARITY MEASURE, COSINE CRITERION'
        print 'STANDARD MEASURE: PAGERANK'
        print 'simCosPH = %f, simCosPmH = %f, simCosPI = %f' %(simCosPH, simCosPmH, simCosPI)
        print ''
        print 'STANDARD MEASURE: HITS'
        print 'simCosHP = %f, simCosHmH = %f, simCosHI = %f' %(simCosHP, simCosHmH, simCosHI)
        print ''
        print 'STANDARD MEASURE: modified HITS'
        print 'simCosmHP = %f, simCosmHH = %f, simCosmHI = %f' %(simCosmHP, simCosmHH, simCosmHI)
        print ''
        print 'STANDARD MEASURE: Inboundlink'
        print 'simCosIP = %f, simCosIH = %f, simCosImH = %f' %(simCosIP, simCosIH, simCosImH)
        print '----------------------------------------------'
        print 'SIMILARITY MEASURE, SPEARMAN CRITERION'
        print 'STANDARD MEASURE: PAGERANK'
        print 'spPH = %f, spPmH = %f, spPI = %f' %(spPH, spPmH, spPI)
        print ''
        print 'STANDARD MEASURE: HITS'
        print 'spHP = %f, spHmH = %f, spHI = %f' %(spHP, spHmH, spHI)
        print ''
        print 'STANDARD MEASURE: modified HITS'
        print 'spmHP = %f, spmHH = %f, spmHI = %f' %(spmHP, spmHH, spmHI)
        print ''
        print 'STANDARD MEASURE: Inboundlink'
        print 'spIP = %f, spIH = %f, spImH = %f' %(spIP, spIH, spImH)
        print '----------------------------------------------'
        print "SIMILARITY MEASURE, KENDALL'S TAU CRITERION"
        print 'STANDARD MEASURE: PAGERANK'
        print 'ktPH = %f, ktPmH = %f, ktPI = %f' %(ktPH, ktPmH, ktPI)
        print ''
        print 'STANDARD MEASURE: HITS'
        print 'ktHP = %f, ktHmH = %f, ktHI = %f' %(ktHP, ktHmH, ktHI)
        print ''
        print 'STANDARD MEASURE: modified HITS'
        print 'ktmHP = %f, ktmHH = %f, ktmHI = %f' %(ktmHP, ktmHH, ktmHI)
        print ''
        print 'STANDARD MEASURE: Inboundlink'
        print 'ktIP = %f, ktIH = %f, ktImH = %f' %(ktIP, ktIH, ktImH)

        f = open('Similarity.txt','w')
        f.write('-----------------------------------------------\n')
        f.write('nodeNum = %i, nonZero = %i\n'%(nodeNum, nonZero))
        f.write('-----------------------------------------------\n')
        f.write('SIMILARITY MEASURE, COSINE CRITERION\n')
        f.write('STANDARD MEASURE: PAGERANK\n')
        f.write('simCosPH = %f, simCosPmH = %f, simCosPI = %f\n' %(simCosPH, simCosPmH, simCosPI))
        f.write(' \n')
        f.write('STANDARD MEASURE: HITS\n')
        f.write('simCosHP = %f, simCosHmH = %f, simCosHI = %f\n' %(simCosHP, simCosHmH, simCosHI))
        f.write(' \n')
        f.write('STANDARD MEASURE: modified HITS\n')
        f.write('simCosmHP = %f, simCosmHH = %f, simCosmHI = %f\n' %(simCosmHP, simCosmHH, simCosmHI))
        f.write(' \n')
        f.write('STANDARD MEASURE: Inboundlink\n')
        f.write('simCosIP = %f, simCosIH = %f, simCosImH = %f\n' %(simCosIP, simCosIH, simCosImH))
        f.write('----------------------------------------------\n')
        f.write('SIMILARITY MEASURE, SPEARMAN CRITERION\n')
        f.write('STANDARD MEASURE: PAGERANK\n')
        f.write('spPH = %f, spPmH = %f, spPI = %f\n' %(spPH, spPmH, spPI))
        f.write(' \n')
        f.write('STANDARD MEASURE: HITS\n')
        f.write('spHP = %f, spHmH = %f, spHI = %f\n' %(spHP, spHmH, spHI))
        f.write(' \n')
        f.write('STANDARD MEASURE: modified HITS\n')
        f.write('spmHP = %f, spmHH = %f, spmHI = %f\n' %(spmHP, spmHH, spmHI))
        f.write(' \n')
        f.write('STANDARD MEASURE: Inboundlink\n')
        f.write('spIP = %f, spIH = %f, spImH = %f\n' %(spIP, spIH, spImH))
        f.write('----------------------------------------------\n')
        f.write("SIMILARITY MEASURE, KENDALL'S TAU CRITERION\n")
        f.write('STANDARD MEASURE: PAGERANK\n')
        f.write('ktPH = %f, ktPmH = %f, ktPI = %f\n' %(ktPH, ktPmH, ktPI))
        f.write(' \n')
        f.write('STANDARD MEASURE: HITS\n')
        f.write('ktHP = %f, ktHmH = %f, ktHI = %f\n' %(ktHP, ktHmH, ktHI))
        f.write(' \n')
        f.write('STANDARD MEASURE: modified HITS\n')
        f.write('ktmHP = %f, ktmHH = %f, ktmHI = %f\n' %(ktmHP, ktmHH, ktmHI))
        f.write(' \n')
        f.write('STANDARD MEASURE: Inboundlink\n')
        f.write('ktIP = %f, ktIH = %f, ktImH = %f\n' %(ktIP, ktIH, ktImH))
        f.close()

    def backbutton(self):
        word = 'backbutton'
        wordid = self.getentryid('wordlist', 'word', word)
        for (urlid,) in self.con.execute('select rowid from urllist'):
            if self.con.execute('select toid from link where fromid = %i' %urlid).fetchone() == None:
                for (backlinker,) in self.con.execute('select fromid from link where toid = %i' %urlid):
                    if backlinker == urlid: continue
                    cur = self.con.execute("insert into link(fromid, toid) values(%i, %i)" %(urlid, backlinker))
                    linkid = cur.lastrowid
                    self.con.execute("insert into linkwords(linkid, wordid) values(%i, %i)" %(linkid, wordid))
            else: continue
        self.dbcommit()

    def danglingnode(self):
        t = 1
        # Clear out danglingnode tables
        self.con.execute('drop table if exists dnode')
        # Create danglingnode table
        self.con.execute('create table dnode(id integer primary key, val integer)')
        # Initialize every item with False value
        self.con.execute('insert into dnode select rowid, 0 from urllist')
        self.dbcommit()
        # Find dangling nodes
        for (urlid,) in self.con.execute('select rowid from urllist'):
            if self.con.execute('select toid from link where fromid = %i' %urlid).fetchone() == None:
                self.con.execute('update dnode set val = %i where id=%i'%(t,urlid))
        self.dbcommit()


### CLASS CRAWLER ENDS HERE###





#QUERY INDEPENDENT SCORES (LINK SCORES) END HERE#

import pythinsearch14
page = "data"
c=pythinsearch14.crawler('')
c.crawl(page)
#searcher=pythinsearch14.searcher('wikipedia.db')
#wordids,urls=searcher.query('python','qi')

Your scraping is getting text that is not ascii, and you're trying to treat is as ascii. That's why you're getting that error. Here and here is a more Python-centric version.

Can you suggest solution? I am getting confuse regarding russian and latin-1 encode. I have passed 2 weeks for this and read so many things.

Zohaib: I hate the unicode / ascii conversion process in Python.

You need to make a fundamental choice: either write a function that converts unicode to ascii, silently dropping all 'complicated' characters

or

You need to rewrite your code so it's unicode throughout

The first option is ugly, and wrong, and easiest.

ok i am trying....