Объекты Scrapy (items) не являются сериализуемыми в JSON при сохранении их в CouchDB

# items.py -- item definitions for the book scraper.
import scrapy
from scrapy.item import Item, Field
import json


class Attributes(scrapy.Item):
    """Secondary book attributes, nested under bookItem['attributes']."""
    description = Field()
    pages = Field()
    author = Field()


class Vendor(scrapy.Item):
    """Seller information, nested under bookItem['vendor']."""
    title = Field()
    order_url = Field()


class bookItem(scrapy.Item):
    """Top-level record for one scraped book page."""
    title = Field()
    url = Field()
    marketprice = Field()
    images = Field()
    price = Field()
    attributes = Field()
    vendor = Field()
    time_scraped = Field()

Мой паук (spider)

 from scrapy.contrib.spiders import CrawlSpider, Rule from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.selector import HtmlXPathSelector from scrapy.item import Item from scrapy.spider import BaseSpider from scrapy import log from scrapper.items import bookItem,Attributes,Vendor import couchdb import logging import json import time from couchdb import Server class libertySpider(CrawlSpider): couch = couchdb.Server() db = couch['python-tests'] name = "libertybooks" allowed_domains = ["libertybooks.com"] unvisited_urls = [] visited_urls = [] start_urls = [ "http://www.libertybooks.com" ] url=["http://www.kaymu.pk"] rules = [Rule(SgmlLinkExtractor(), callback='parse_item', follow=True)] total=0 productpages=0 exceptionnum=0 def parse_item(self,response): if response.url.find("pid")!=-1: with open("number.html","w") as w: self.total=self.total+1 w.write(str(self.total)+","+str(self.productpages)) itm=bookItem() attrib=Attributes() ven=Vendor() images=[] try: name=response.xpath('//span[@id="pagecontent_lblbookName"]/text()').extract()[0] name=name.encode('utf-8') except: name="name not found" try: price=response.xpath('//span[@id="pagecontent_lblPrice"]/text()').extract()[0] price=price.encode('utf-8') except: price=-1 try: marketprice=response.xpath('//span[@id="pagecontent_lblmarketprice"]/text()').extract()[0] marketprice=marketprice.encode('utf-8') except: marketprice=-1 try: pages=response.xpath('//span[@id="pagecontent_spanpages"]/text()').extract()[0] pages=pages.encode('utf-8') except: pages=-1 try: author=response.xpath('//span[@id="pagecontent_lblAuthor"]/text()').extract()[0] author=author.encode('utf-8') except: author="author not found" try: description=response.xpath('//span[@id="pagecontent_lblbookdetail"]/text()').extract()[0] description=description.encode('utf-8') except: description="des: not found" try: image=response.xpath('//img[@id="pagecontent_imgProduct"]/@src').extract()[0] image=image.encode('utf-8') except: 
image="#" ven['title']='libertybooks' ven['order_url']=response.url itm['vendor']=ven itm['time_scraped']=time.ctime() itm['title']=name itm['url']=response.url itm['price']=price itm['marketprice']=marketprice itm['images']=images attrib['pages']=pages attrib['author']=author attrib['description']=description itm['attributes']=attrib self.saveindb(itm) return itm def saveindb(self,obj): logging.debug(obj) self.db.save(obj) 


Трассировки стека

 2014-12-09 13:57:37-0800 [libertybooks] ERROR: Spider error processing <GET http://www.libertybooks.com/bookdetail.aspx?pid=16532> Traceback (most recent call last): File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent call.func(*call.args, **call.kw) File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 638, in _tick taskObj._oneWorkUnit() File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 484, in _oneWorkUnit result = next(self._iterator) File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 57, in <genexpr> work = (callable(elem, *args, **named) for elem in iterable) --- <exception caught here> --- File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 96, in iter_errback yield next(it) File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output for x in result: File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr> return (_set_referer(r) for r in result or ()) File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr> return (r for r in result or () if _filter(r)) File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr> return (r for r in result or () if _filter(r)) File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spiders/crawl.py", line 67, in _parse_response cb_res = callback(response, **cb_kwargs) or () File "/home/asad/Desktop/scrapper/scrapper/spiders/liberty_spider.py", line 107, in parse_item self.saveindb(itm) File "/home/asad/Desktop/scrapper/scrapper/spiders/liberty_spider.py", line 112, in saveindb self.db.save(obj) File "/usr/local/lib/python2.7/dist-packages/couchdb/client.py", line 431, in save _, _, data = func(body=doc, **options) File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", 
line 514, in post_json **params) File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 533, in _request_json headers=headers, **params) File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 529, in _request credentials=self.credentials) File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 244, in request body = json.encode(body).encode('utf-8') File "/usr/local/lib/python2.7/dist-packages/couchdb/json.py", line 69, in encode return _encode(obj) File "/usr/local/lib/python2.7/dist-packages/couchdb/json.py", line 135, in <lambda> dumps(obj, allow_nan=False, ensure_ascii=False) File "/usr/lib/python2.7/json/__init__.py", line 250, in dumps sort_keys=sort_keys, **kw).encode(obj) File "/usr/lib/python2.7/json/encoder.py", line 207, in encode chunks = self.iterencode(o, _one_shot=True) File "/usr/lib/python2.7/json/encoder.py", line 270, in iterencode return _iterencode(o, 0) File "/usr/lib/python2.7/json/encoder.py", line 184, in default raise TypeError(repr(o) + " is not JSON serializable") exceptions.TypeError: {'attributes': {'author': 'Tina Fey', 'description': "Once in a generation a woman comes along who changes everything. Tina Fey is not that woman, but she met that woman once and acted weird around her.\r\n\r\nBefore 30 Rock, Mean Girls and 'Sarah Palin', Tina Fey was just a young girl with a dream: a recurring stress dream that she was being chased through a local airport by her middle-school gym teacher.\r\n\r\nShe also had a dream that one day she would be a comedian on TV. She has seen both these dreams come true.\r\n\r\nAt last, Tina Fey's story can be told. 
From her youthful days as a vicious nerd to her tour of duty on Saturday Night Live; from her passionately halfhearted pursuit of physical beauty to her life as a mother eating things off the floor; from her one-sided college romance to her nearly fatal honeymoon - from the beginning of this paragraph to this final sentence.\r\n\r\nTina Fey reveals all, and proves what we've all suspected: you're no one until someone calls you bossy.", 'pages': '304 Pages'}, 'images': [], 'marketprice': '1,095', 'price': '986', 'time_scraped': 'Tue Dec 9 13:57:37 2014', 'title': 'Bossypants', 'url': 'http://www.libertybooks.com/bookdetail.aspx?pid=16532', 'vendor': {'order_url': 'http://www.libertybooks.com/bookdetail.aspx?pid=16532', 'title': 'libertybooks'}} is not JSON serializable 

Я новичок в Scrapy и CouchDB. Я также попытался преобразовать объект item в JSON с помощью `json.dumps(itm, default=lambda o: o.__dict__, sort_keys=True, indent=4)`, но получил ту же ошибку. Подскажите, пожалуйста, как сделать мои классы сериализуемыми в JSON, чтобы их можно было сохранять в CouchDB?

  • Как имитировать запрос xhr с помощью Scrapy при попытке обхода данных из веб-сайта на основе ajax?
  • Как я могу использовать разные конвейеры для разных пауков в одном проекте Scrapy
  • Как скрапить веб-сайт с защитой Sucuri
  • windows scrapyd-deploy не распознается
  • Объект «NoneType» не имеет атрибута «_app_data» в scrapy \ twisted \ openssl
  • Анализ HTML с помощью XPath, Python и Scrapy
  • Ошибка при создании проекта Scrapy через startproject
  • clang: error: неизвестный аргумент: '-mno-fused-madd'
  • One Solution collect form web for “объекты для очистки не являются сериализуемыми JSON, сохраняя их на couchdb”

    Ну, более короткий ответ – просто использовать ScrapyJSONEncoder :

     from scrapy.utils.serialize import ScrapyJSONEncoder _encoder = ScrapyJSONEncoder() ... def saveindb(self,obj): logging.debug(obj) self.db.save(_encoder.encode(obj)) 

    Более длинная версия: если вы собираетесь развивать этого паука дальше (а не использовать его одноразово), имеет смысл вынести сохранение элементов в CouchDB в конвейер (pipeline) и тем самым разделить ответственность: логика обхода и извлечения данных — в коде паука, сохранение в базу данных — в коде конвейера.

    Сначала это может показаться чрезмерной инженерией, но это действительно помогает, когда проект начинает расти и упрощает тестирование.

    Python - лучший язык программирования в мире.