Need help debugging a Python web crawler

I haven't been able to get my web crawler (named searchengine.py) to run, despite my best efforts over the last couple of hours. It doesn't seem to index pages successfully as it goes. I'll give you the full crawler code. The kind of errors I'm getting looks like this:

Indexing http://www.4futureengineers.com/company.html
Could not parse page http://www.4futureengineers.com/company.html

I run searchengine.py by entering the following commands in my interactive Python session (shell):

>> import searchengine
>> crawler=searchengine.crawler('searchindex.db')
>> pages= \
.. ['http://www.4futureengineers.com/company.html']
>> crawler.crawl(pages)

It gives the errors, i.e. the failed parsing, immediately after the crawler.crawl(pages) command.

Here is the complete source code of searchengine.py:

import urllib2
from BeautifulSoup import *
from urlparse import urljoin
from pysqlite2 import dbapi2 as sqlite

# Create a list of words to ignore
ignorewords={'the':1,'of':1,'to':1,'and':1,'a':1,'in':1,'is':1,'it':1}

class crawler:
  # Initialize the crawler with the name of database
  def __init__(self,dbname):
    self.con=sqlite.connect(dbname)

  def __del__(self):
    self.con.close()

  def dbcommit(self):
    self.con.commit()

  # Auxilliary function for getting an entry id and adding
  # it if it's not present
  def getentryid(self,table,field,value,createnew=True):
    cur=self.con.execute(
        "select rowid from %s where %s='%s'" % (table,field,value))
    res=cur.fetchone()
    if res==None:
      cur=self.con.execute(
          "insert into %s (%s) values ('%s')" % (table,field,value))
      return cur.lastrowid
    else:
      return res[0]

  # Index an individual page
  def addtoindex(self,url,soup):
    if self.isindexed(url): return
    print 'Indexing '+url

    # Get the individual words
    text=self.gettextonly(soup)
    words=self.separatewords(text)

    # Get the URL id
    urlid=self.getentryid('urllist','url',url)

    # Link each word to this url
    for i in range(len(words)):
      word=words[i]
      if word in ignorewords: continue
      wordid=self.getentryid('wordlist','word',word)
      self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i))

  # Extract the text from an HTML page (no tags)
  def gettextonly(self,soup):
    v=soup.string
    if v==Null:
      c=soup.contents
      resulttext=''
      for t in c:
        subtext=self.gettextonly(t)
        resulttext+=subtext+'\n'
      return resulttext
    else:
      return v.strip()

  # Seperate the words by any non-whitespace character
  def separatewords(self,text):
    splitter=re.compile('\\W*')
    return [s.lower() for s in splitter.split(text) if s!='']

  def isindexed(self,url):
    u=self.con.execute \
        ("select rowid from urllist where url='%s'" % url).fetchone()
    if u!=None:
      # Check if it has actually been crawled
      v=self.con.execute(
          'select * from wordlocation where urlid=%d' % u[0]).fetchone()
      if v!=None: return True
    return False

  def crawl(self,pages,depth=2):
    for i in range(depth):
      newpages={}
      for page in pages:
        try:
          c=urllib2.urlopen(page)
        except:
          print "Could not open %s" % page
          continue
        try:
          soup=BeautifulSoup(c.read())
          self.addtoindex(page,soup)

          links=soup('a')
          for link in links:
            if ('href' in dict(link.attrs)):
              url=urljoin(page,link['href'])
              if url.find("'")!=-1: continue
              url=url.split('#')[0]  # remove location portion
              if url[0:4]=='http' and not self.isindexed(url):
                newpages[url]=1
              linkText=self.gettextonly(link)
              self.addlinkref(page,url,linkText)
          self.dbcommit()
        except:
          print "Could not parse page %s" % page
      pages=newpages

  # Create the database tables
  def createindextables(self):
    self.con.execute('create table urllist(url)')
    self.con.execute('create table wordlist(word)')
    self.con.execute('create table wordlocation(urlid,wordid,location)')
    self.con.execute('create table link(fromid integer,toid integer)')
    self.con.execute('create table linkwords(wordid,linkid)')
    self.con.execute('create index wordidx on wordlist(word)')
    self.con.execute('create index urlidx on urllist(url)')
    self.con.execute('create index wordurlidx on wordlocation(wordid)')
    self.con.execute('create index urltoidx on link(toid)')
    self.con.execute('create index urlfromidx on link(fromid)')
    self.dbcommit()

One solution, collected from the web, for "Need help debugging a Python web crawler"

The error handling in crawl makes debugging considerably harder:

try:
    # too much stuff here
except:  # bare except
    print "Could not parse page %s" % page  # generic message

While this is very stable (i.e. if anything goes wrong, the program keeps running), it makes it impossible to figure out what is actually going wrong: all you know is that one of the thirteen lines inside the try block failed. Refactor this section of the code into short try blocks that check for specific errors (see "The evils of except"); a rough sketch of what that could look like is shown below.
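For illustration, here is a rough sketch of a narrower crawl body, not the author's exact fix: urllib2.URLError is one example of a specific error class worth catching around the network call, and the link-reference bookkeeping (addlinkref) from the original is commented out here for brevity.

def crawl(self, pages, depth=2):
    for i in range(depth):
        newpages = {}
        for page in pages:
            # Small try block around the network call only,
            # catching a specific error class.
            try:
                c = urllib2.urlopen(page)
            except urllib2.URLError as e:
                print "Could not open %s: %s" % (page, e)
                continue

            # Parsing and indexing are left outside any try/except, so a
            # bug here produces a full traceback instead of the generic
            # "Could not parse page" message.
            soup = BeautifulSoup(c.read())
            self.addtoindex(page, soup)

            for link in soup('a'):
                if 'href' not in dict(link.attrs):
                    continue
                url = urljoin(page, link['href'])
                if url.find("'") != -1:
                    continue
                url = url.split('#')[0]  # remove the fragment portion
                if url[0:4] == 'http' and not self.isindexed(url):
                    newpages[url] = 1
                # self.addlinkref(page, url, self.gettextonly(link))
                # (link-reference bookkeeping omitted from this sketch)

            self.dbcommit()
        pages = newpages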

You could also try running with no error handling at all (comment out the try:, except: and print ... lines and dedent the lines that are currently inside the try block), read the specific error tracebacks that result, and then put appropriate error handling back in later. A minimal way to reproduce the failing step by hand is sketched below.
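For example, a minimal sketch of reproducing the failing step by hand in the interactive session, using the same URL from the question; these are just the calls crawl makes internally, so the first one that breaks will print a full traceback instead of the generic message:

import urllib2
from BeautifulSoup import BeautifulSoup

import searchengine

crawler = searchengine.crawler('searchindex.db')
page = 'http://www.4futureengineers.com/company.html'

c = urllib2.urlopen(page)        # a failure here raises urllib2.URLError
soup = BeautifulSoup(c.read())   # a failure here raises a parser error
crawler.addtoindex(page, soup)   # a bug in the indexing code now raises an
                                 # ordinary exception with a full traceback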
