/*
 * index_ali.js
 *
 * From the article "[Hands-on] Node.js scraper for Amazon product detail pages"
 * (posted 2018-09-14, category: Node.js).
 * NOTE(review): the article text says Amazon while every file name here uses
 * "ali" — confirm which site the saved HTML comes from.
 *
 * Configuration file (package.json):
 *
 *   {
 *     "name": "2",
 *     "version": "1.0.0",
 *     "description": "",
 *     "main": "index_ali.js",
 *     "dependencies": {
 *       "exceljs": "^1.0.0",
 *       "iconv-lite": "^0.4.19",
 *       "jsdom": "^11.6.2"
 *     },
 *     "devDependencies": {},
 *     "scripts": {
 *       "test": "echo \"Error: no test specified\" && exit 1"
 *     },
 *     "author": "",
 *     "license": "ISC"
 *   }
 *
 * What the script does: parses a saved HTML fragment, collects one record per
 * order row plus a handful of page-level figures, and appends the result to a
 * CSV file.
 */
'use strict'

const https = require('https')
const fs = require('fs')
const iconv = require('iconv-lite')
const jsdom = require('jsdom')
const { JSDOM } = jsdom

// Page-level fields, in the order they appear as CSV columns after num/time.
const SUMMARY_KEYS = [
  'percentnum',
  'rantingsnum',
  'ordernum',
  'pprice',
  'pdiscountrate',
  'wishlistnum'
]

// Records collected by getInput() and consumed by setOutput() / toExcel().
let items = []

/**
 * Returns the innerHTML of the first element matching `selector`, or '' when
 * the page has no such element (so one missing figure does not abort the run).
 */
function innerHTMLOf (root, selector) {
  const node = root.querySelector(selector)
  return node === null ? '' : node.innerHTML
}

/**
 * Returns the innerHTML of the first descendant of `row` carrying `className`,
 * or `fallback` when there is none.
 */
function firstByClass (row, className, fallback) {
  const node = row.getElementsByClassName(className)[0]
  return node === undefined ? fallback : node.innerHTML
}

/**
 * Reads ./input-ali.txt, parses it as HTML and pushes one record per
 * `.col-order` row onto `items`. Every record carries the row's order-num and
 * order-time (or '0' when absent) together with the page-level figures.
 */
function getInput () {
  const html = fs.readFileSync('./input-ali.txt')
  const doc = new JSDOM(html.toString()).window.document
  const rows = doc.querySelectorAll('.transaction-feedback-table tbody .col-order')

  const summary = {
    percentnum: innerHTMLOf(doc, '.percent-num'),
    rantingsnum: innerHTMLOf(doc, '.rantings-num'),
    ordernum: innerHTMLOf(doc, '.order-num'),
    // Only the first space is stripped, exactly as in the original expression.
    pprice: (innerHTMLOf(doc, '#j-sku-discount-price') + innerHTMLOf(doc, '.p-symbol')).replace(/ /, ''),
    pdiscountrate: innerHTMLOf(doc, '.p-discount-rate'),
    wishlistnum: innerHTMLOf(doc, '.wishlist-num')
  }

  for (const row of rows) {
    const record = {
      num: firstByClass(row, 'order-num', '0'),
      time: firstByClass(row, 'order-time', '0')
    }
    items.push(Object.assign(record, summary))
  }
}

/**
 * Dumps `items` as JSON to ./Electric_pressure_cooker/mid_output_ali.txt.
 * Asynchronous; a failure is reported on stderr, success is silent.
 * Not called by default (see the bottom of the file).
 */
function setOutput () {
  const json = JSON.stringify(items)
  fs.writeFile('./Electric_pressure_cooker/mid_output_ali.txt', json, err => {
    if (err) {
      console.error(err)
    }
  })
}

/**
 * Makes a value safe to place in one CSV cell.
 *
 * @param {string|undefined|null} str raw field text
 * @returns {string|null} null when `str` is missing; otherwise the text with
 *   newlines turned into spaces and, when it contains a comma or a double
 *   quote, wrapped in double quotes with inner quotes doubled.
 */
function prepare (str) {
  if (str === undefined || str === null) {
    return null
  }
  const flat = str.replace(/\n/g, ' ')
  // Without quoting, a comma inside a value (a price, for instance) would
  // shift every following column.
  if (/[",]/.test(flat)) {
    return '"' + flat.replace(/"/g, '""') + '"'
  }
  return flat
}

/** Returns prepare(value), with a missing value rendered as an empty cell. */
function csvCell (value) {
  const cell = prepare(value)
  return cell === null ? '' : cell
}

/**
 * Drops records that share a `time`, keeping the last record seen for each
 * value. A Map is used so rows stay in first-seen order whatever the key
 * looks like.
 */
function uniqueByTime (rows) {
  const byTime = new Map()
  for (const row of rows) {
    byTime.set(row.time, row)
  }
  return Array.from(byTime.values())
}

/**
 * De-duplicates `items` by time, keeps the page-level figures on the first
 * record only, and appends one numbered CSV line per record to
 * ./output-ali.csv. The file is opened in append mode, so running the script
 * again adds to the existing content.
 */
function toExcel () {
  items = uniqueByTime(items)

  // The page-level figures are identical on every record; show them once.
  for (const row of items.slice(1)) {
    for (const key of SUMMARY_KEYS) {
      delete row[key]
    }
  }

  const lines = items.map((row, index) => {
    const cells = [String(index + 1), csvCell(row.num), csvCell(row.time)]
    for (const key of SUMMARY_KEYS) {
      cells.push(csvCell(row[key]))
    }
    return cells.join(',') + '\n'
  })

  // One synchronous append for the whole batch; nothing is written (and no
  // file is created) when there are no records.
  if (lines.length > 0) {
    fs.writeFileSync('./output-ali.csv', lines.join(''), { flag: 'a' })
  }
}

getInput()
// setOutput() may be called here as well to keep a JSON dump of the records.
toExcel()

/*
 * Usage (from the article):
 * 1. Open the Amazon product list page.
 * 2. Press F12, select the list div, then right-click -> Copy -> Copy element.
 * 3. Open input.txt, delete the old data and paste the new data.
 *    NOTE(review): getInput() reads ./input-ali.txt, not input.txt.
 * 4. Run `node index_ali.js` to generate the CSV file.
 */