「前端基础」写了个淘宝商品信息及评价采集爬虫脚本,分享一下源码

/**
 * Crawler configuration for Taobao search-result (list) pages and item pages.
 *
 * - scanUrls: entry list page (offset parameter "s" starts at 0).
 * - helperUrlRegexes: further list pages of the same search query.
 * - contentUrlRegexes: item detail pages from which `fields` are extracted.
 *
 * NOTE(review): the exact meaning of `interval` (presumably the delay between
 * requests, in milliseconds) is defined by the hosting crawler platform — confirm.
 */
var configs = {
    domains: ["s.taobao.com", "item.taobao.com"],
    scanUrls: ["https://s.taobao.com/list?q=%E5%A5%97%E8%A3%85%E5%A5%B3%E5%A4%8F&s=0"],
    contentUrlRegexes: ["https?://item\\.taobao\\.com/item\\.htm\\?.*"],
    helperUrlRegexes: ["https?://s\\.taobao\\.com/list\\?q=%E5%A5%97%E8%A3%85%E5%A5%B3%E5%A4%8F&s=\\d+"],
    interval: 10000,
    fields: [
        {
            // First extracted field: the item title.
            name: "title",
            selector: "//h3[contains(@class,'tb-main-title')]/@data-title",
            required: true
        },
        {
            // Second extracted field: promotional price if present, otherwise the regular price.
            name: "price",
            selector: "//em[@id='J_PromoPriceNum'] | //em[contains(@class,'tb-rmb-num')]",
            required: true
        },
        {
            // Third extracted field: the main item image element.
            name: "thumb",
            selector: "//*[@id='J_ImgBooth']"
        }
    ]
};

// Step by which the "s" (start offset) query parameter grows between consecutive list pages.
var LIST_PAGE_SIZE = 60;

// Captures the JSON object assigned to `g_page_config` inside the list page's inline script.
var PAGE_CONFIG_REGEX = /g_page_config\s*=\s*(\{.*\})?;\s*\n*\s*g_srp_loadCss\(\)\;/;

// Total number of list pages for the current search; 0 means "not known yet".
var totalPageNum = 0;

/**
 * Extracts and parses the `g_page_config` JSON embedded in a list page.
 *
 * @param {string} raw - Raw page source.
 * @returns {Object|null} The parsed config, or null when the page does not
 *     contain the expected script or its JSON is malformed.
 */
function extractPageConfig(raw) {
    var match = PAGE_CONFIG_REGEX.exec(raw);
    if (match === null || typeof match[1] !== "string") {
        return null;
    }
    try {
        return JSON.parse(match[1]);
    } catch (e) {
        // A page whose config cannot be parsed is treated like a page without config.
        return null;
    }
}

/**
 * Returns the pager object (`mods.sortbar.data.pager`) of a page config, or null if absent.
 *
 * @param {Object} pageConfig - Parsed `g_page_config`.
 * @returns {Object|null}
 */
function getPager(pageConfig) {
    var mods = pageConfig.mods;
    if (!mods || !mods.sortbar || !mods.sortbar.data) {
        return null;
    }
    return mods.sortbar.data.pager || null;
}

/**
 * Returns the item list (`mods.itemlist.data.auctions`) of a page config.
 *
 * @param {Object} pageConfig - Parsed `g_page_config`.
 * @returns {Array} The auctions array, or an empty array if absent.
 */
function getAuctions(pageConfig) {
    var mods = pageConfig.mods;
    if (!mods || !mods.itemlist || !mods.itemlist.data) {
        return [];
    }
    var auctions = mods.itemlist.data.auctions;
    return Array.isArray(auctions) ? auctions : [];
}

/**
 * Turns an item's `detail_url` into an absolute https URL.
 *
 * @param {*} detailUrl - Usually a protocol-relative URL such as "//item.taobao.com/...".
 * @returns {string|null} Absolute URL, or null when no usable URL is given.
 */
function toItemUrl(detailUrl) {
    if (typeof detailUrl !== "string" || detailUrl === "") {
        return null;
    }
    // Already absolute: prefixing "https:" again would corrupt it.
    if (/^https?:/.test(detailUrl)) {
        return detailUrl;
    }
    return "https:" + detailUrl;
}

/**
 * Builds the URL of the list page that follows `url` by advancing its "&s=" offset.
 *
 * @param {string} url - URL of the current list page.
 * @returns {string|null} Next page URL, or null when `url` has no numeric "&s=" parameter.
 */
function buildNextListUrl(url) {
    var match = /&s=(\d+)/.exec(url);
    if (match === null) {
        return null;
    }
    var nextStart = parseInt(match[1], 10) + LIST_PAGE_SIZE;
    return url.replace(match[0], "&s=" + nextStart);
}

/**
 * Shared implementation of the scan-page and helper-page handlers: enqueues
 * every item URL found on a list page and, unless this is the last page,
 * the URL of the next list page.
 *
 * @param {Object} page - Page object; `page.raw` and `page.url` are read.
 * @param {*} content - Page content; a null value aborts processing.
 * @param {Object} site - Object whose `addUrl(url)` enqueues a URL.
 * @param {boolean} isScanPage - When true, `totalPageNum` is initialised
 *     from the page's pager if it is still 0.
 * @returns {boolean} Always false.
 *     NOTE(review): presumably tells the platform not to auto-discover links — confirm.
 */
function processListPage(page, content, site, isScanPage) {
    if (content === null) {
        return false;
    }

    var pageConfig = extractPageConfig(page.raw);
    if (pageConfig === null) {
        // No usable config (e.g. an unexpected or blocked page): nothing to enqueue.
        return false;
    }

    var pager = getPager(pageConfig);
    if (isScanPage && totalPageNum === 0 && pager !== null) {
        var parsedTotal = parseInt(pager.totalPage, 10);
        // Keep 0 when the total is unreadable so pagination stops instead of running unbounded.
        totalPageNum = isNaN(parsedTotal) ? 0 : parsedTotal;
    }

    var auctions = getAuctions(pageConfig);
    for (var i = 0, n = auctions.length; i < n; i++) {
        var itemUrl = toItemUrl(auctions[i] && auctions[i].detail_url);
        if (itemUrl !== null) {
            site.addUrl(itemUrl);
        }
    }

    if (pager === null) {
        return false;
    }
    var currentPageNum = parseInt(pager.currentPage, 10);
    if (isNaN(currentPageNum)) {
        return false;
    }
    if (currentPageNum >= totalPageNum) {
        // Last page reached: reset so a later scan page re-reads the total.
        totalPageNum = 0;
        return false;
    }

    var nextUrl = buildNextListUrl(page.url);
    if (nextUrl !== null) {
        site.addUrl(nextUrl);
    }
    return false;
};

/**
 * Handler for scan (entry) list pages; also records the total page count.
 *
 * @param {Object} page - Page object (`raw`, `url`).
 * @param {*} content - Page content; null aborts processing.
 * @param {Object} site - Provides `addUrl(url)`.
 * @returns {boolean} Always false.
 */
configs.onProcessScanPage = function(page, content, site) {
    return processListPage(page, content, site, true);
};

/**
 * Handler for helper (subsequent) list pages.
 *
 * @param {Object} page - Page object (`raw`, `url`).
 * @param {*} content - Page content; null aborts processing.
 * @param {Object} site - Provides `addUrl(url)`.
 * @returns {boolean} Always false.
 */
configs.onProcessHelperPage = function(page, content, site) {
    return processListPage(page, content, site, false);
};

// Create the crawler from `configs` (including the page handlers attached to it) and start it.
// NOTE(review): `Crawler` is not defined in this file — presumably provided by the
// hosting crawler platform; confirm before running elsewhere.
var crawler = new Crawler(configs);
crawler.start();

爬虫脚本如何编写点这里

更多源码分享点这里

代码如何运行点这里

2个月前
回答
暂无回答
我来回答
无用回答
问题修改记录
暂无修改记录
广告位 点击查看投放指南

我的收藏