scrapy items for dynamic fields - python

The item field needs to change depending on which index of start_urls the request came from.
For example:
location = input("Location:")
second_location = input("Second Location:")

start_urls = [
    "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + location,
    "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + second_location
    # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + third_location,
    # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + fourth_location
]

if self.start_urls[0]:
    item['location'] = location
if self.start_urls[1]:
    item['location'] = second_location
What happens is that item['location'] stays fixed and never changes dynamically, so every item's output location is the value of location, even when the request came from self.start_urls[1].
This is what I have so far.
items.py
class Item(scrapy.Item):
    business_name = scrapy.Field()
    website = scrapy.Field()
    phonenumber = scrapy.Field()
    email = scrapy.Field()
    location = scrapy.Field()
    # third_location = scrapy.Field()
    # fourth_location = scrapy.Field()
    visit_id = scrapy.Field()
    visit_status = scrapy.Field()
myspider.py
search_item = input("Input The Search Item: ")
location = input("Location:")
second_location = input("Second_Location:")
# city = [
# "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "Fort Worth",
# "San Diego", "Dallas", "San Jose", "Austin", "Columbus", "Indianapolis", "Seattle", "St. Paul", "Nashville",
# "Louisville", "Plano"
# ]
# rancity = random.choice(city)
class YellowSpider(scrapy.Spider):
    name = "yellow"
    start_urls = [
        "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + location,
        "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + second_location
        # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + third_location,
        # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + fourth_location
    ]

    def __init__(self):
        self.seen_business_names = []
        self.seen_phonenumbers = []
        self.seen_websites = []
        self.seen_emails = []

    def parse(self, response):
        for href in response.css('div.v-card a.business-name::attr(href)'):
            yield response.follow(href, self.businessprofile)
        for href in response.css('div.pagination a::attr(href)'):
            yield response.follow(href, self.parse)

    def businessprofile(self, response):
        for business in response.css('header#main-header'):
            item = Item()
            item['business_name'] = business.css('div.sales-info h1::text').extract()
            w = business.css('a.secondary-btn.website-link::attr(href)').extract()
            item['website'] = str(w).strip('[]')
            if self.start_urls[0]:
                item['location'] = location
            if self.start_urls[1]:
                item['location'] = second_location
            s = business.css('a.email-business::attr(href)').extract()
            item['email'] = [item[7:] for item in s]
            item['phonenumber'] = business.css('p.phone::text').extract_first()
            for x in item['email']:
                # new code here, call to self.seen_business_names
                if x not in self.seen_emails:
                    if item['email']:
                        if item['phonenumber']:
                            if item['website']:
                                self.seen_emails.append(x)
                                yield item

Your code makes no sense.
if self.start_urls[0]:
    item['location'] = location
if self.start_urls[1]:
    item['location'] = second_location
As long as the elements of start_urls are not empty strings (or other falsy values), both of these blocks will be executed.
If I understand your problem correctly, you want item['location'] to match the location used in the starting URL. The simplest way to do this is to make your requests carry this information.
You should make custom requests in start_requests(), and use the method described in https://doc.scrapy.org/en/latest/topics/request-response.html#topics-request-response-ref-request-callback-arguments to pass the location as request meta data.
After that, just pass it along to any subsequent requests.
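A minimal sketch of that approach (inside the spider class, reusing the search_item, location and second_location variables from above; the 'location' meta key is an arbitrary name):

def start_requests(self):
    for loc in (location, second_location):
        url = ("https://www.yellowpages.com/search?search_terms=" + search_item
               + "&geo_location_terms=" + loc)
        # attach the location to the request so the callback knows where it came from
        yield scrapy.Request(url, callback=self.parse, meta={'location': loc})

def parse(self, response):
    loc = response.meta['location']
    for href in response.css('div.v-card a.business-name::attr(href)'):
        # keep passing the location along to the profile pages
        yield response.follow(href, self.businessprofile, meta={'location': loc})
    for href in response.css('div.pagination a::attr(href)'):
        yield response.follow(href, self.parse, meta={'location': loc})

def businessprofile(self, response):
    item = Item()
    item['location'] = response.meta['location']
    # ... fill in the remaining fields exactly as before ...
    yield item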

Related

Scrapy spider not stopping after page ends

I have written a spider in Scrapy to scrape a website. Everything is working fine except one thing: once the spider has reached the last page, it starts scraping again from the last page back to the first.
Here is my code.
import scrapy
from scrapy.http import Request
from tutorial.items import DmozItem

class DmozSpider(scrapy.Spider):
    name = "tutorial"
    allowed_domain = ["jabong.com"]
    start_urls = [
        "http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=1"
    ]
    page_index = 1

    def parse(self, response):
        products = response.xpath('//li')
        if products:
            for product in products:
                item = DmozItem()
                item_url = product.xpath('@data-url').extract()
                item_url = "http://www.jabong.com/" + item_url[0] if item_url else ''
                if item_url:
                    request = Request(url=item_url, callback=self.parse_page2, meta={"item": item},
                                      headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"})
                    request.meta['item'] = item
                    yield request
        else:
            return
        self.page_index += 1
        if self.page_index:
            yield Request(url="http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=%s" % (self.page_index),
                          headers={"Referer": "http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/",
                                   "X-Requested-With": "XMLHttpRequest"},
                          callback=self.parse)

    def parse_page2(self, response):
        sizes = []
        item = response.meta['item']
        item['site_name'] = 'jabong'
        item['site_domain'] = 'http://www.jabong.com'
        name = response.xpath('.//span[contains(@id, "qa-title-product")]/span/text()').extract()
        item['name'] = name[0] if name else ''
        brand = response.xpath('.//span[contains(@id, "qa-prd-brand")]/text()').extract()
        item['brand'] = brand[0] if brand else ''
        desc1 = response.xpath('.//div[contains(@id, "productInfo")]/p/text()').extract()
        desc2 = response.xpath('.//div[contains(@id, "productInfo")]/p/span/text()').extract()
        item['desc'] = desc1[0] if desc1 else desc2[0] if desc2 else ''
        sku = response.xpath('//*[@id="qa-sku"]/text()').extract()
        item['sku'] = sku[0] if sku else ''
        item['age'] = 'adult'
        gender = response.xpath('.//a[contains(@id, "qa-breadcrumb2")]/span/text()').extract()
        item['gender'] = gender[0] if gender else ''
        category = response.xpath('.//a[contains(@id, "qa-breadcrumb3")]/span/text()').extract()
        item['category'] = category[0] if category else ''
        sub_category = response.xpath('.//a[contains(@id, "qa-breadcrumb4")]/span/text()').extract()
        item['sub_category'] = sub_category[0] if sub_category else ''
        size = response.xpath('.//ul[contains(@id, "listProductSizes")]/li/text()').extract()
        item['size'] = sizes
        if size:
            for s in size:
                sizes.append(s.strip())
            item['size'] = sizes
        material = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Fabric Details")]/../td[2]/text()').extract()
        if material:
            item['material'] = material[0]
        else:
            material = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Fabric")]/../td[2]/text()').extract()
            item['material'] = material[0] if material else ''
        pattern = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Pattern")]/../td[2]/text()').extract()
        item['pattern'] = pattern[0] if pattern else ''
        color = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Color")]/../td[2]/text()').extract()
        item['colors'] = color if color else ''
        style = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Style")]/../td[2]/text()').extract()
        item['style'] = style[0] if style else ''
        images = response.xpath('.//div[contains(@class, "thumb-slider pos-abs")]/span/@data-image-big').extract()
        item['images'] = images if images else ''
        price1 = response.xpath('.//span[contains(@id, "before_price")]/span[2]/text()').extract()
        item['price'] = {}
        item['price']['mrp'] = price1[0].strip() if price1 else ''
        item['price']['discount'] = ''
        item['price']['sale'] = ''
        care_tips = response.xpath('//*[@id="productInfo"]/table/tr/td[contains(text(),"Wash Care")]/../td[2]/text()').extract()
        item['care_tips'] = care_tips[0] if care_tips else ''
        item['url'] = response.url
        item['tags'] = ''
        yield item
It looks like this website redirects http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=* to http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/ once the page number goes past the last page. Is that expected? If so, I think your spider is endless according to your code: the redirected page still contains products, so parse keeps incrementing page_index and requesting the next page forever.
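If that redirect is indeed what happens, one way to stop is to check whether the request was redirected before paginating further. Assuming the default RedirectMiddleware is enabled, it records the original URL(s) under the redirect_urls meta key; a rough sketch:

def parse(self, response):
    # a redirect away from the ?page=N URL means we went past the last page
    if response.meta.get('redirect_urls'):
        return
    products = response.xpath('//li')
    if not products:
        return
    # ... yield the per-product requests as before ...
    self.page_index += 1
    yield Request(
        url="http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=%s" % self.page_index,
        headers={"X-Requested-With": "XMLHttpRequest"},
        callback=self.parse)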

Building a tree with Scrapy

I am new to web scraping.
What I basically want to do is to scrape the following website: https://www.admin.ch/opc/fr/classified-compilation/national.html, which references all Swiss federal law.
The structure of the website is tree-like (you start with categories: Etat – Peuple – Autorités, Défense nationale, and so on, which contain sub-categories, until you reach the leaves, the legal texts), and I would like to get the same structure in a JSON file.
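Roughly, the nested structure I am after would look something like this (just an illustration of the shape; the real names come from the site):

{
    "Etat – Peuple – Autorités": {
        "some sub-category": {
            "a deeper sub-category": "leaf: the legal text"
        }
    },
    "Défense nationale": {
        "...": "..."
    }
}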
Here is the spider I have written for the moment:
import scrapy
from scrapy.spiders import XMLFeedSpider
from admin_ch.helpers.helper_functions import is_article_number, is_int, is_final_article

class Spider(XMLFeedSpider):
    name = "spider_droit_interne"
    allowed_domains = ["admin.ch"]
    start_urls = [
        'https://www.admin.ch/opc/fr/classified-compilation/national.html'
    ]
    itertag = 'item'

    def parse(self, response):
        for category in response.xpath('/html/body/div/div[2]/div/div[2]/table/tbody/tr'):
            cat_number = category.xpath('td/text()').extract_first()
            cat_name = cat_number + ' ' + category.xpath('td/a/text()').extract_first()
            url = category.xpath('td/a/@href').extract_first()
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_category, meta={'name': cat_name})
            return  # Just do the 1st category as an example

    def parse_category(self, response):
        name = response.meta['name']
        tree = {}
        node = {}
        for category in response.xpath('/html/body/div/div[2]/div/div[2]/table/tbody/tr'):
            cat_number = category.xpath('td/text()').extract_first()
            cat_name = cat_number + ' ' + category.xpath('td/a/text()').extract_first()
            if is_int(cat_number) and len(cat_number) < 3:
                url = category.xpath('td/a/@href').extract_first()
                url = response.urljoin(url)
                if is_final_article(url):
                    node[cat_name] = 'leaf reached!!!!!'
                    yield node
                else:
                    # node[cat_name] = 'more nodes'
                    yield scrapy.Request(url, callback=self.get_article, meta={'name': cat_name})
        tree[name] = node
        yield tree

    def get_article(self, response):
        name = response.meta['name']
        tree = {}
        node = {}
        for category in response.xpath('/html/body/div/div[2]/div/div[2]/table/tbody/tr'):
            cat_number = category.xpath('td/text()').extract()[1]
            for element in [' ', '\n', '\r']:
                cat_number = cat_number.replace(element, '')
            if is_article_number(cat_number):
                url = category.xpath('td/a/@href').extract_first()
                url = response.urljoin(url)
                if is_final_article(url):
                    node[cat_number] = 'leaf reached!!!!!'
                else:
                    node[cat_number] = cat_number
                    # yield scrapy.Request(url, callback=self.get_article, meta={'tree': item})
                    # The content is not important for the moment, only the structure
        tree[name] = node
        yield tree
The helper functions only check regular expressions; nothing really important in our case (they do work):
import re

def is_article_number(string):
    regex_article_number = '^[0-9]+[.[0-9]+]*$'
    return re.match(regex_article_number, string)

def is_int(string):
    regex_integer = '^[0-9]+'
    return re.match(regex_integer, string)

def is_final_article(url):
    return url.endswith('index.html')
The result I get from this is terribly wrong. Could you help me out with this, please?

scrapy.Request not working in a recursive callback

I'm doing web scraping with Scrapy. The spider basically goes to each sub-site and first checks whether the site is a PDF. If it is, it yields this PDF and the process terminates. If it's an HTML site, it goes on to check, for one level only, whether there are any links inside this sub-site. If there are, it calls the current method recursively to yield items in whatever format (HTML or PDF). However, the "yield scrapy.Request(url, callback=self.parse_article, meta=new_meta)" line is not working. Can anyone point out what is wrong in my logic/code?
def parse_article(self, response):
    item = response.meta
    item['url'] = response.url
    _, ext = os.path.splitext(urlparse(response.url).path)
    is_binary_document = response.meta.pop('is_binary_document', False)
    if is_binary_document:  # binary files
        item['html_content'] = None
        item['content'] = response.body_as_unicode()
    else:  # html files
        item['content'] = extract_text(response.xpath('//div[@id="content"]//text()'))
        item['mime_type'] = 'text/html'
        item['html_content'] = response.body_as_unicode()
    category = response.meta.pop('category', None)
    follow_links = response.meta.pop('follow_links', True)
    if follow_links:
        if '/research/' in item['url']:
            for year_url in response.xpath('//select[@class="dropdownYear"]/option/@value').extract():
                yield scrapy.Request(response.urljoin(year_url), self.parse_research_year)
        else:
            # Follow links on page for PDF, XLS files, etc that are in the same sub category as referer
            for a in response.xpath('//div[@id="content"]//a[@href]'):
                href = a.xpath('@href').extract_first()
                _, ext = os.path.splitext(href)
                url = response.urljoin(href)
                if category is None or '/{}/'.format(category) in url:
                    new_meta = response.meta.copy()
                    new_meta['follow_links'] = False  # only follow for one level
                    link_text = extract_text(a.xpath('.//text()'))
                    yield scrapy.Request(url, callback=self.parse_article, meta=new_meta)
                #end for
            #end if
        #end if
    #end if
    yield item
#end def
#end class
And here is the entire class code:
from __future__ import unicode_literals

import mimetypes
import os
import re
from urlparse import urlparse

from dateutil import parser as dateparser
import scrapy

from ..middlewares.binary_document import SUPPORTED_EXTENSIONS
from ..utils import extract_text

__author__ = 'Yuge Chen'

class HKMASpider(scrapy.Spider):
    name = 'hkma'
    jurisdiction = 'HK'
    source_type = 'primary'
    type = 'legislation/reg'
    tier = 1
    start_urls = [
        'http://www.hkma.gov.hk/eng/'
    ]
    URL_BLACKLIST = [
        'http://apps.hkma.gov.hk/eng/index.php',
        'http://vpr.hkma.gov.hk/cgi-bin/vpr/index.pl',
        'https://www.cmu.org.hk/cmupbb_ws/eng/page/wmp0100/wmp010001.aspx',
        'http://www.hkma.gov.hk/eng/other-information/photo-gallery/',
        'http://www.hkimr.org/working_papers',
        'http://www.hkimr.org/'
    ]

    def parse(self, response):
        for a in response.xpath('//div[@id="seo"]//a'):
            title = extract_text(a.xpath('.//text()'))
            url = response.urljoin(a.xpath('@href').extract_first())
            # if '/key-functions' in url:
            #     yield scrapy.Request(url, self.parse_key_functions, meta=dict(category='key-functions'))
            if '/publications-and-research/quarterly-bulletin' in url:
                yield scrapy.Request('http://www.hkma.gov.hk/eng/publications-and-research/quarterly-bulletin/', self.parse_publications_research)
                break
            # elif '/key-information' in url:
            #     yield scrapy.Request(url, self.parse_key_information)
            # elif 'about-the-hkma' in url:
            #     pass
            # else:
            #     yield scrapy.Request(url, self.parse_article, meta=dict(title=title))
        #end for
        #yield scrapy.Request('http://www.hkma.gov.hk/eng/key-information/guidelines-and-circulars/guidelines/', self.parse_article, meta=dict(title='Guidelines'))
    #end def

    def parse_key_information(self, response):
        if response.xpath('//select[@class="dropdownYear"]'):
            for year_url in response.xpath('//select[@class="dropdownYear"]/option/@value').extract():
                yield scrapy.Request(response.urljoin(year_url), self.parse_key_information_year)
            #end for
        #end if
        for x in self.parse_key_information_year(response):
            yield x
    #end def

    def parse_key_information_year(self, response):
        for a in response.xpath('//*[@id="content"]//a'):
            title = extract_text(a.xpath('.//text()'))
            url = response.urljoin(a.xpath('@href').extract_first())
            try:
                date_posted = dateparser.parse(extract_text(a.xpath('../../td[1]/text()')))
                yield scrapy.Request(url, self.parse_article, meta=dict(title=title, date_posted=date_posted))
            except ValueError: pass
        #end for
    #end def

    def parse_publications_research(self, response):
        for a in response.xpath('//*[@id="content"]//a'):
            url = response.urljoin(a.xpath('@href').extract_first())
            if ('/half-yearly-monetary' in url or '/quarterly-bulletin' in url) and '/research' not in response.url:
                date_text = extract_text(a.xpath('.//text()')) + ' 1 ' + extract_text(a.xpath('../../td[1]/text()'))
                date_posted = dateparser.parse(date_text)
                title = None
                if '/half-yearly-monetary' in url:
                    title = 'Hong Kong Monetary Authority Half-Yearly Monetary & Financial Stability Report - ' + date_text
                yield scrapy.Request(url, self.parse_article, meta=dict(title=title, date_posted=date_posted, date_text=date_text))
            else:
                title = extract_text(a.xpath('.//text()'))
                yield scrapy.Request(url, self.parse_article, meta=dict(title=title))

    def parse_key_functions(self, response):
        for a in response.xpath('//*[@id="key-functionsLeftNav"]//a'):
            title = extract_text(a.xpath('.//text()'))
            url = response.urljoin(a.xpath('@href').extract_first())
            yield scrapy.Request(url, self.parse_article, meta=dict(title=title, category='key-functions'))
        #end for
    #end def

    def parse_research_year(self, response):
        parent_url = response.url
        print("parent_url::::" + parent_url)
        #print (extract_text(response.xpath('//table[@class="colorTable researchTable"]')) + '***')
        for a in response.xpath('//div[@class="prContent"]//a[@href]'):
            url = response.urljoin(a.xpath('@href').extract_first())
            if not not response.xpath('//table[@class="colorTable researchTable"]'):
                print('++++++++')
                print('))))))' + extract_text(a.xpath('../../td[1]/text()')))
                date_posted = extract_text(a.xpath('../../td[1]/text()'))
                print('))))))' + re.sub('<[^<]+?>', '', extract_text(a.xpath('../../td[2]/strong'))))
                title = re.sub('<[^<]+?>', '', extract_text(a.xpath('../../td[2]/strong')))
            elif not not response.xpath('//table[@class="formTable"]'):
                print('____________')
                print('((((((' + url)
                print('((((((((' + extract_text(a.xpath('../../p[1]/text()')))
                title = extract_text(a.xpath('../../p[1]/text()'))
                print('(((((((((' + extract_text(a.xpath('../text()[1]')))
                date_posted = dateparser.parse(extract_text(a.xpath('../text()[1]')))
            yield scrapy.Request(url, self.parse_article, meta=dict(title=title, date_posted=date_posted))

    def parse_article(self, response):
        print('here????')
        item = response.meta
        item['url'] = response.url
        _, ext = os.path.splitext(urlparse(response.url).path)
        is_binary_document = response.meta.pop('is_binary_document', False)
        print('url!!!' + item['url'])
        if is_binary_document:  # binary files
            print('binary!!')
            item['html_content'] = None
            #item['content'] = response.body_as_unicode()
            if '/quarterly-bulletin' in item['url']:
                if item.get('date_text'):
                    item['title'] = 'Hong Kong Monetary Authority Quarterly Bulletin - ' + item['date_text'] + ' - ' + item['title']
                else:
                    item['title'] = 'Hong Kong Monetary Authority Quarterly Bulletin - ' + item['title']
        else:  # html files
            # item['content'] = extract_text(response.xpath('//div[@id="content"]//text()'))
            item['mime_type'] = 'text/html'
            # item['html_content'] = response.body_as_unicode()
            if not item.get('date_posted'):
                item['date_posted'] = dateparser.parse(extract_text(response.xpath("//*[@id='lastUpdate']/text()")), fuzzy=True)
        category = response.meta.pop('category', None)
        follow_links = response.meta.pop('follow_links', True)
        if follow_links:
            if '/research/' in item['url']:
                for year_url in response.xpath('//select[@class="dropdownYear"]/option/@value').extract():
                    yield scrapy.Request(response.urljoin(year_url), self.parse_research_year)
            else:
                # Follow links on page for PDF, XLS files, etc that are in the same sub category as referer
                for a in response.xpath('//div[@id="content"]//a[@href]'):
                    href = a.xpath('@href').extract_first()
                    _, ext = os.path.splitext(href)
                    url = response.urljoin(href)
                    if category is None or '/{}/'.format(category) in url:
                        new_meta = response.meta.copy()
                        new_meta['follow_links'] = False  # only follow for one level
                        link_text = extract_text(a.xpath('.//text()'))
                        if '/annual-report' in url:
                            new_meta['title'] = '{} {} - {}'.format('Hong Kong Monetary Authority', item['title'], link_text)
                            new_meta['date_posted'] = dateparser.parse('June 1 ' + item['title'][-4:])
                        elif item['title'] is not None:
                            new_meta['title'] = '{} - {}'.format(item['title'], link_text)
                        else:
                            new_meta['title'] = link_text
                        print('url:######' + url)
                        print('title:######' + new_meta['title'])
                        yield scrapy.Request(url, callback=self.parse_article, meta=new_meta)
                    #end for
                #end if
            #end if
        #end if
        yield item
    #end def
#end class

scrapy Unsupported URL scheme '': no handler available for that scheme

I find that short_critic_content(self, response) never runs, and I can't find the reason.
I didn't use allowed_domains; if I add it (allowed_domains = ["movie.mtime.com"]), short_critic_content(self, response) still doesn't run.
Is start_urls = ['http://movie.mtime.com'] wrong or right?
What's wrong with it? I'm getting this error:
Scrapy Unsupported URL scheme '': no handler available for that scheme
class YinPin(CrawlSpider):
    name = "yingping"
    #allowed_domains = ["movie.mtime.com"]
    start_urls = ['http://movie.mtime.com']
    rules = (
        #Rule(LinkExtractor(allow=())),
        Rule(LinkExtractor(allow=(r'http://movie.mtime.com/40677/'), ), callback='movie_info', follow=False),
    )

    def movie_info(self, response):
        selector = Selector(response)
        #for movieinfo in movie_info:
        movie_name = selector.xpath('//*[@id="db_head"]/div[2]/div/div[1]/h1/text()').extract()
        movie_url = response.url  # movieinfo.xpath('//*[@id="db_head"]/div[2]/div/div[2]/a[3]/@href').extract()
        number = re.compile(r'\d+')
        movie_num = int(number.search(str(movie_url)).group())
        movie_release_time = selector.xpath('//*[@id="db_head"]/div[2]/div/div[1]/p[1]/a/text()').extract()
        movie_place = selector.xpath('//*[@id="db_head"]/div[2]/div/div[2]/text()').extract()[3]
        movie_type = selector.xpath('//*[@id="db_head"]/div[2]/div/div[2]/a/text()').extract()
        movie_type_l = movie_type.pop()
        movie_type = ' '.join(movie_type)
        short_content = selector.css('#tweetRegion > dd > div > h3::text').extract()  # selector.xpath('//*[@id="tweetRegion"]').css('h3::text').extract()
        short_url = str(selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract())
        yield Request(short_url, callback=self.short_critic_content,
                      meta={'movie_num': movie_num,
                            'short_content': short_content})
        item = YingpingItem(
            movie_num=movie_num,
            movie_name=movie_name,
            movie_release_time=movie_release_time,
            movie_place=movie_place,
            movie_type=movie_type,
        )
        yield item

    def short_critic_content(self, response):
        selector = Selector(response)
        movie_num = response.meta['movie_num']
        short_contentft = response.meta['short_content']
        short_contentsd = selector.css('#tweetRegion > dd > div > h3::text').extract()
        short_contents = short_contentft + short_contentsd
        item = shortcriticItem(
            movie_num=movie_num,
            movie_scritic=short_contents
        )
        yield item
It's almost certain the problem is in this line of your movie_info function:
short_url = str(selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract())
The extract() method of Selector returns a list, which you then convert to a string. But that won't give you the URL; it gives you a string representation of the list, which starts with the [ character. That's why you get that error.
The correct way is either
short_url = selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract()[0]
or, even better, to use extract_first() instead of extract():
short_url = selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract_first()
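One caveat: if that link is missing on some pages, extract_first() returns None and Request(None) will then raise, so a small guard may be worth adding, for example:

short_url = selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract_first()
if short_url:  # only follow the link when it actually exists on the page
    yield Request(short_url, callback=self.short_critic_content,
                  meta={'movie_num': movie_num, 'short_content': short_content})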

how to extract certain string from URL

I am trying to extract certain strings from the below-mentioned URLs.
Sample URLs:
http://www.ladyblush.com/buy-sarees-online.html?p=1
http://www.ladyblush.com/buy-ladies-suits-online.html?p=1
http://www.ladyblush.com/buy-women-fashion-accessories.html?p=1
I want to extract:
productCategory = "sarees" productSubCategory = ""
productCategory = "ladies" productSubCategory = "suits"
productCategory = "women" productSubCategory = "fashion-accessories"
And so on. Actually, I am writing a spider and I need to extract productCategory and productSubCategory from URLs like those mentioned above, so I am trying to extract these fields inside the parse method from response.url. Can someone help me out, please?
My code:
import re
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider
#------------------------------------------------------------------------------

class ESpider(CrawlSpider):
    name = "ladyblushSpider"
    allowed_domains = ["ladyblush.com"]

    URLSList = []
    for n in range(1, 100):
        URLSList.append('http://www.ladyblush.com/buy-sarees-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-ladies-suits-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-women-fashion-accessories.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-nightwear-lingerie-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-women-dress-online-skirts-suits-kurtis-tops.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-decor-online-wallclock-bedsheets-cushions-bedcovers.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-cosmetics-online-massage-oils-aromatherapy-perfumes-soaps.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-jewelery-online-art-fashion-semi-precious-antique-junk-jewellery.html?p=' + str(n))
    start_urls = URLSList

    def parse(self, response):
        item = EscraperItem()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="third thumbnailSpillLarge"]')
        items = []
        for site in sites:
            item = EscraperItem()
            item['currency'] = 'INR'
            item['productCategory'] = [""]
            item['productSubCategory'] = [""]
            item['productSite'] = ["http://ladyblush.com/"]
            item['productImage'] = site.select('./a/div/img/@src').extract()
            item['productTitle'] = site.select('./a/div/img/@title').extract()
            item['productURL'] = [site.select('./a/@href').extract()[0].replace(" ", "%20")]
            productMRP = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="old-price"]//span[@class="price"]/text()').extract()
            productPrice = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="special-price"]//span[@class="price"]/text()').extract()
            if productMRP and productPrice:
                price = [productMRP[1].strip()] + [productPrice[1].strip()]
            else:
                price = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//span[@class="regular-price"]//span[@class="price"]/text()').extract()
            item['productPrice'] = price
            items.append(item)
            secondURL = item['productURL'][0]
            request = Request(secondURL, callback=self.parsePage2)
            request.meta['item'] = item
            yield request

    def parsePage2(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        if hxs.select('//div[@class="addtocart-container"]/div/text()').extract():
            item['availability'] = False
        else:
            item['availability'] = True
        if hxs.select('//label[@class="required"]/text()').extract():
            item['hasVariants'] = True
        else:
            item['hasVariants'] = False
        item['image_urls'] = list(set(item['productImage']))
        item['productDesc'] = [" ".join([re.sub(r'[\t\n\r]', "", i.strip()) for i in hxs.select('//div[@class="std"]/text()').extract()])]
        item['productImage'] = item['productImage'] + hxs.select('//div[@class="more-views"]/ul/li/a/img/@src').extract() + hxs.select('//div[@class="more-views"]/ul/li/a/@href').extract()
        return item
#------------------------------------------------------------------------------
You can get the URL from
response.url in the parse method. You could then parse that to get just the URL path:
import os
test = 'buy-women-fashion-accessories.html?p=1'
parts = os.path.splitext(test)
# ('buy-women-fashion-accessories', '.html?p=1')
parts[0].split('-')[1:]
# ['women', 'fashion', 'accessories']
This is a rather flimsy solution, though. Are you sure the data is not stored somewhere in the HTML of the page you are parsing, instead of in the URL?
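If the URL really is the only place this information lives, one possible sketch for the example URLs above (category_from_url is a made-up helper name; dropping the leading 'buy' and a trailing 'online' is an assumption based on those three URLs):

from urlparse import urlparse  # Python 2, as in the question; use urllib.parse on Python 3

def category_from_url(url):
    # e.g. '/buy-ladies-suits-online.html?p=1' -> ('ladies', 'suits')
    slug = urlparse(url).path.rsplit('/', 1)[-1].rsplit('.', 1)[0]  # 'buy-ladies-suits-online'
    words = slug.split('-')[1:]                                     # drop the leading 'buy'
    if words and words[-1] == 'online':                             # drop a trailing 'online' if present
        words = words[:-1]
    return (words[0] if words else '', '-'.join(words[1:]))

# inside parse(): item['productCategory'], item['productSubCategory'] = category_from_url(response.url)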
