Need help debugging a Python web crawler

I have not been able to get my web crawler (named searchengine.py) to run, despite my best efforts over the last couple of hours. It does not seem to be able to index pages as it crawls them. I will give you the full crawler code below. The kind of error I am getting looks like this:

    Indexing http://www.4futureengineers.com/company.html
    Could not parse page http://www.4futureengineers.com/company.html

I invoked searchengine.py by entering the following commands in my interactive Python session (shell):

    >> import searchengine
    >> crawler=searchengine.crawler('searchindex.db')
    >> pages= \
    .. ['http://www.4futureengineers.com/company.html']
    >> crawler.crawl(pages)

This produces the error shown above, i.e. the "Could not parse page" failure appears immediately after the crawler.crawl(pages) command.

Here is the complete source code of searchengine.py:

    import urllib2
    from BeautifulSoup import *
    from urlparse import urljoin
    from pysqlite2 import dbapi2 as sqlite

    # Create a list of words to ignore
    ignorewords={'the':1,'of':1,'to':1,'and':1,'a':1,'in':1,'is':1,'it':1}

    class crawler:
      # Initialize the crawler with the name of database
      def __init__(self,dbname):
        self.con=sqlite.connect(dbname)

      def __del__(self):
        self.con.close()

      def dbcommit(self):
        self.con.commit()

      # Auxilliary function for getting an entry id and adding
      # it if it's not present
      def getentryid(self,table,field,value,createnew=True):
        cur=self.con.execute(
        "select rowid from %s where %s='%s'" % (table,field,value))
        res=cur.fetchone()
        if res==None:
          cur=self.con.execute(
          "insert into %s (%s) values ('%s')" % (table,field,value))
          return cur.lastrowid
        else:
          return res[0]

      # Index an individual page
      def addtoindex(self,url,soup):
        if self.isindexed(url): return
        print 'Indexing '+url

        # Get the individual words
        text=self.gettextonly(soup)
        words=self.separatewords(text)

        # Get the URL id
        urlid=self.getentryid('urllist','url',url)

        # Link each word to this url
        for i in range(len(words)):
          word=words[i]
          if word in ignorewords: continue
          wordid=self.getentryid('wordlist','word',word)
          self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i))

      # Extract the text from an HTML page (no tags)
      def gettextonly(self,soup):
        v=soup.string
        if v==Null:
          c=soup.contents
          resulttext=''
          for t in c:
            subtext=self.gettextonly(t)
            resulttext+=subtext+'\n'
          return resulttext
        else:
          return v.strip()

      # Seperate the words by any non-whitespace character
      def separatewords(self,text):
        splitter=re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s!='']

      def isindexed(self,url):
        u=self.con.execute \
          ("select rowid from urllist where url='%s'" % url).fetchone()
        if u!=None:
          #Check if it has actually been crawled
          v=self.con.execute(
          'select * from wordlocation where urlid=%d' % u[0]).fetchone()
          if v!=None: return True
        return False

      def crawl(self,pages,depth=2):
        for i in range(depth):
          newpages={}
          for page in pages:
            try:
              c=urllib2.urlopen(page)
            except:
              print "Could not open %s" % page
              continue
            try:
              soup=BeautifulSoup(c.read())
              self.addtoindex(page,soup)

              links=soup('a')
              for link in links:
                if ('href' in dict(link.attrs)):
                  url=urljoin(page,link['href'])
                  if url.find("'")!=-1: continue
                  url=url.split('#')[0]  # remove location portion
                  if url[0:4]=='http' and not self.isindexed(url):
                    newpages[url]=1
                  linkText=self.gettextonly(link)
                  self.addlinkref(page,url,linkText)

              self.dbcommit()
            except:
              print "Could not parse page %s" % page

          pages=newpages

      # Create the database tables
      def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()

One solution

The error handling in crawl has made debugging much harder than it needs to be:

    try:
        # too much stuff here
    except:  # bare except
        print "Could not parse page %s" % page  # generic message

While this is very robust (i.e. if anything at all goes wrong, the program keeps running), it makes it impossible to figure out what is actually going wrong: all you know is that one of the thirteen lines inside the try block failed. Refactor that section of the code into short try blocks that check for specific errors (see "The evils of except").

You could also try running with no error handling at all (comment out the try:, except:, and print ... lines and dedent the lines currently inside the try block), read the specific error tracebacks that will point you at the problem, and then put appropriate error handling back in afterwards.
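As a rough sketch of that advice (this is not the book's code, just one possible restructuring that assumes the rest of the class stays as posted; the linkText/addlinkref bookkeeping is omitted for brevity), the loop body of crawl could be split into narrow try blocks that catch specific exceptions:

    # Sketch only: crawl() with narrower try blocks and specific exceptions.
    # Relies on the module-level imports already in searchengine.py
    # (urllib2, BeautifulSoup, urljoin).
    def crawl(self, pages, depth=2):
        for i in range(depth):
            newpages = {}
            for page in pages:
                try:
                    c = urllib2.urlopen(page)
                except urllib2.URLError, e:      # network/HTTP failures only
                    print "Could not open %s: %s" % (page, e)
                    continue

                try:
                    soup = BeautifulSoup(c.read())
                except Exception, e:             # HTML parsing failures only
                    print "Could not parse page %s: %s" % (page, e)
                    continue

                # Indexing and link extraction run outside any try/except,
                # so a genuine bug here raises a full traceback instead of
                # being swallowed by a generic "Could not parse page".
                self.addtoindex(page, soup)
                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href']).split('#')[0]
                    if url[0:4] == 'http' and not self.isindexed(url):
                        newpages[url] = 1

                self.dbcommit()

            pages = newpages

Once the real traceback points at the failing line, you can tighten the error handling back up around exactly that call.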
