Web crawling code with Scrapy
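The script below crawls the Yes24 monthly bestseller list for August 2013, follows each book's detail page, and exports the title, a secondary title, the rating, and the tags to a CSV file.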

import os

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

year = 2013  # bestseller-list year interpolated into start_urls


class Yes24Spider(scrapy.Spider):
    name = 'yes24'
    start_urls = [
        f'http://www.yes24.com/24/category/bestseller?sumgb=09&year={year}&month={month}&FetchSize=80'
        for month in range(8, 8 + 1)  # August only; a leading-zero literal like 08 is a SyntaxError in Python 3
        # for year in years  -- multi-year variant sketched after the script
    ]

    def parse(self, response):
        # Each bestseller entry: grab the link and visible text, then follow
        # the link to the book's detail page.
        for item in response.css('.goodsTxtInfo p:first-child'):
            url = item.css('a:first-child::attr(href)').get()
            title = ''.join(item.css('::text').extract()).strip()
            self.logger.info('%s %s', url, title)
            yield response.follow(url, self.parse_detail)

    def parse_detail(self, response):
        # Per-request throttling is handled by the DOWNLOAD_DELAY setting in
        # __main__; a time.sleep() call here would block Scrapy's event loop
        # and stall every other in-flight request.
        title = response.css('.gd_name::text').get()
        if not title:
            return
        yield {
            'title': title.strip(),
            'title2': (response.css('.gd_nameE::text').get() or '').strip(),
            'rating': response.css('.gd_rating:first-child .yes_b::text').get(),
            'tags': ','.join(response.css('.tagArea .tag a::text').extract())
        }


if __name__ == '__main__':
    output_path = 'output/items_201308.csv'
    if os.path.exists(output_path):
        os.remove(output_path)  # delete any previous run's output so the CSV feed starts fresh
    process = CrawlerProcess({
        **get_project_settings(),
        'DOWNLOAD_DELAY': 1,   # polite 1s delay between requests (non-blocking, unlike time.sleep)
        'FEED_FORMAT': 'csv',  # lowercase exporter name, as in the Scrapy docs
        'FEED_URI': output_path
    })
    process.crawl(Yes24Spider)
    process.start()
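
Because of the __main__ block, running the file directly with Python starts the crawl and writes output/items_201308.csv; there is no need to invoke scrapy crawl separately.

The commented-out "for year in years" line hints at a multi-year crawl. A minimal sketch of that variant, where the years range and the full-year month span are assumptions for illustration:

years = range(2013, 2016)  # hypothetical span
start_urls = [
    f'http://www.yes24.com/24/category/bestseller?sumgb=09&year={y}&month={m}&FetchSize=80'
    for y in years
    for m in range(1, 13)
]

With multiple months in play, the hard-coded output/items_201308.csv filename would also need to become dynamic, or the feed kept as one combined file.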