ZhangYang's Blog

【实战】亚马逊产品详情页nodejs爬虫

配置文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
// package.json
{
"name": "2",
"version": "1.0.0",
"description": "",
"main": "index_ali.js",
"dependencies": {
"exceljs": "^1.0.0",
"iconv-lite": "^0.4.19",
"jsdom": "^11.6.2"
},
"devDependencies": {},
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC"
}

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
// index_ali.js
const https = require('https')
const fs = require('fs')
const iconv = require('iconv-lite')
const jsdom = require('jsdom')
const { JSDOM } = jsdom
// Shared accumulator of scraped records: filled by getInput, then rebound and
// consumed by toExcel (which is why this is not a const binding).
var items = []
/**
 * Parses the saved page HTML in ./input-ali.txt and appends one record per
 * matching transaction row to the module-level `items` array.
 *
 * Every record holds the row's own `num` and `time` plus the page-level values
 * (percentnum, rantingsnum, ordernum, pprice, pdiscountrate, wishlistnum),
 * which are repeated unchanged on each record.
 *
 * Missing markup is tolerated: a row cell that is absent yields '0', and a
 * page-level element that is absent yields '' (with a warning) instead of
 * crashing on a null dereference.
 *
 * @returns {void}
 * @throws {Error} whatever fs.readFileSync throws when the input file cannot be read.
 */
const getInput = () => {
  const html = fs.readFileSync('./input-ali.txt').toString()
  const doc = new JSDOM(html).window.document

  // innerHTML of the first element in the document matching `selector`.
  // querySelector returns null when nothing matches, so guard before reading.
  const pageValue = selector => {
    const el = doc.querySelector(selector)
    if (el === null) {
      console.warn(`getInput: no element matches "${selector}"; using an empty value`)
      return ''
    }
    return el.innerHTML
  }

  // innerHTML of the first descendant of `row` carrying `className`, or '0'.
  const rowValue = (row, className) => {
    const cell = row.getElementsByClassName(className)[0]
    return cell === undefined ? '0' : cell.innerHTML
  }

  const percentnum = pageValue('.percent-num')
  const rantingsnum = pageValue('.rantings-num')
  const ordernum = pageValue('.order-num')
  // Price text followed by the currency symbol; the regex has no `g` flag, so
  // only the first space in the combined string is removed.
  const pprice = (pageValue('#j-sku-discount-price') + pageValue('.p-symbol')).replace(/ /, '')
  const pdiscountrate = pageValue('.p-discount-rate')
  const wishlistnum = pageValue('.wishlist-num')

  const rows = doc.querySelectorAll('.transaction-feedback-table tbody .col-order')
  for (const row of rows) {
    items.push({
      num: rowValue(row, 'order-num'),
      time: rowValue(row, 'order-time'),
      percentnum,
      rantingsnum,
      ordernum,
      pprice,
      pdiscountrate,
      wishlistnum
    })
  }
}
/**
 * Asynchronously dumps the module-level `items` array as JSON to
 * ./Electric_pressure_cooker/mid_output_ali.txt.
 *
 * The write is fire-and-forget; a failure (for example, the target directory
 * not existing) is reported on stderr. Nothing is logged on success.
 *
 * @returns {void}
 */
const setOutput = () => {
  fs.writeFile('./Electric_pressure_cooker/mid_output_ali.txt', JSON.stringify(items), err => {
    // fs.writeFile passes null on success; only report real failures.
    if (err) {
      console.error(err)
    }
  })
}
/**
 * Makes a scraped value safe to place in a single cell of a comma-separated row.
 *
 * ASCII commas would split the value into extra columns, so they are replaced
 * with the full-width comma (U+FF0C); newlines would split the row, so they
 * are replaced with a space.
 *
 * @param {*} str - the raw value; non-string values are converted with String().
 * @returns {string|null} the sanitised text, or null when `str` is undefined or null.
 */
const prepare = str => {
  if (str === undefined || str === null) {
    return null
  }
  // /,/g and /\n/g are regular expressions; the `g` flag replaces every occurrence.
  return String(str).replace(/,/g, '\uFF0C').replace(/\n/g, ' ')
}
/**
 * Appends the collected `items` to ./output-ali.csv, one comma-separated line
 * per record, in the column order:
 *   row number, num, time, percentnum, rantingsnum, ordernum, pprice,
 *   pdiscountrate, wishlistnum
 *
 * Steps:
 *  1. De-duplicate by `time`: when several records share a time, the last one
 *     wins, and rows keep the order in which each time first appeared.
 *  2. Remove the six page-level fields from every record except the first, so
 *     they are printed on the first line only.
 *  3. Pass each value through `prepare`; a missing value becomes an empty cell.
 *  4. Append all lines to the CSV in a single synchronous write.
 *
 * Side effects: rebinds the module-level `items` to the de-duplicated array and
 * deletes the page-level keys from its records. Nothing is written when
 * `items` is empty. The file is opened in append mode, so running the script
 * again adds to the existing content.
 *
 * @returns {void}
 * @throws {Error} whatever fs.writeFileSync throws when the file cannot be written.
 */
const toExcel = () => {
  // Fields that are identical on every record (page-level values).
  const pageKeys = ['percentnum', 'rantingsnum', 'ordernum', 'pprice', 'pdiscountrate', 'wishlistnum']

  // A Map keeps first-insertion order for every key, while set() on an
  // existing key replaces the stored record in place.
  const byTime = new Map()
  for (const record of items) {
    byTime.set(record.time, record)
  }
  items = Array.from(byTime.values())

  // Keep the page-level fields on the first record only.
  items.forEach((record, index) => {
    if (index === 0) {
      return
    }
    for (const key of pageKeys) {
      delete record[key]
    }
  })

  // prepare() returns null for a missing value; print that as an empty cell.
  const cell = value => {
    const text = prepare(value)
    return text === null ? '' : text
  }

  const lines = items.map((record, index) => {
    const cells = [String(index + 1), cell(record.num), cell(record.time)]
      .concat(pageKeys.map(key => cell(record[key])))
    return cells.join(',') + '\n'
  })

  if (lines.length > 0) {
    // flag 'a' = append to the end of the existing file; writeFileSync takes no callback.
    fs.writeFileSync('./output-ali.csv', lines.join(''), { flag: 'a' })
  }
}
// Script entry point: parse the saved page into `items` first, then append the
// rows to the CSV. The two commented-out lines are optional debugging steps
// (print the first record / dump all records as JSON).
getInput()
//console.log(items[0])
//setOutput()
toExcel()

使用说明

1.打开亚马逊产品列表

2.按F12,选择列表div,右键copy->copy element

3.打开与index_ali.js同目录下的input-ali.txt,将原有数据删除后粘贴新的数据

4.运行 node index_ali.js,生成csv格式文件 output-ali.csv(结果会追加到该文件末尾,重新生成前请先删除旧文件)