python - Scrapy return a updated dict item back -
im trying scrape details subsite. each item new subsite want retrieve additional information from. im storing information dict per item , per detail.
the problem duplicate items when yielding item dict items, , expect has me parising whole item in function instead of updated dict items. however, i have been successful in parsing updated dict item the item. instead new item of every updated dict item. expect problem either has for loop, or 'def parse_item_sub' function.
to give quick intro im doing in code;
- retrive id's database
- scrape site attributes item. because site contains several identical items, varying information. have created additional sub_items, scraped main item, dictionaries.
- loop on each dict item contains href, information site , scrape dict in dictionary. 'item['subsite_dic'][position]' thereafter return item is possible return updated dict item of item?
the code
from scrapy.spiders import spider scrapy.selector import selector scrapeinfo.items import infoitem import pyodbc class scrapeinfo(spider): name = "info" allowed_domains = ["http://www.nevermind.com"] start_urls = [] def start_requests(self): #get infoid , type database self.conn = pyodbc.connect('driver={sql server};server=server;database=dbname;uid=user;pwd=password') self.cursor = self.conn.cursor() self.cursor.execute("select infoid, category dbo.stageitem") rows = self.cursor.fetchall() row in rows: url = 'http://www.nevermind.com/info/' infoid = row[0] category = row[1] yield self.make_requests_from_url(url+infoid, infoid, category, self.parse) def make_requests_from_url(self, url, infoid, category, callback): request = request(url, callback) request.meta['infoid'] = infoid request.meta['category'] = category return request def parse(self, response): hxs = selector(response) infodata = hxs.xpath('div[2]/div[2]') # input item path itempool = [] infoid = response.meta['infoid'] category = response.meta['category'] info in infodata: item = infoitem() item1, item2, item3 = [infoitemsubsite() in range(3)] # stem details item['id'] = infoid item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract() item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract() item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract() item1['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract() item1['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract() item1['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract() item2['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract() item2['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract() item2['field7'] = info.xpath('tr[7]/td[2]/p/b/@href').extract() item3['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract() item3['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract() item3['field7'] = info.xpath('tr[7]/td[2]/p/b/@href').extract() item['subsite_dic'] = [dict(item1), dict(item2), dict(item3)] sub_href = [d['field6'] d in item['subsite_dic']] # below code should send each dictionary item further information # think might here have problem position = 0 in sub_href: if i: url = 'http://www.nevermind.com/info/'+''.join(i[0]) yield request(url, self.parse_item_sub, meta={'item':item,'position':position}) print i[0], position, url position = position +1 else: position = position +1 itempool.append(item) yield item pass # function extract additional info subsite, , return original item. # works , right information extracted def parse_item_sub(self, response, category): hxs = selector(response) position = response.meta['position'] item = response.meta['item'] subsite = item['subsite_dic'][position] item['subsite_field11'] = i.xpath('/td[2]/span/@title').extract() item['subsite_field12'] = i.xpath('/tr[4]/td[2]/text()').extract() item['subsite_field13'] = i.xpath('/div[5]/a[1]/@href').extract() response.meta['item'] = item yield item pass
Comments
Post a Comment