ZhangYang's Blog

【实战】速卖通产品页列表nodejs爬虫

配置文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
// package.json
{
"name": "1",
"version": "1.0.0",
"description": "",
"main": "index_ali.js",
"dependencies": {
"exceljs": "^1.0.0",
"iconv-lite": "^0.4.19",
"jsdom": "^11.6.2"
},
"devDependencies": {},
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC"
}

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
// index_ali.js
const https = require('https')
const fs = require('fs')
const iconv = require('iconv-lite')
const jsdom = require('jsdom')
const { JSDOM } = jsdom
// Module-level accumulator: getInput() pushes one record
// ({ href, img, name, price, cnum, onum }) per parsed entry,
// and setOutput() / toExcel() read from it.
const items = []
/**
 * Reads the HTML saved in ./input-ali.txt, parses it with jsdom and pushes
 * one record onto the module-level `items` array for every element that
 * carries the classes `_3liAhj _1R0K0g` (treated as one product entry).
 *
 * Record shape: { href, img, name, price, cnum, onum }.
 *   - href / name: `href` and `title` attributes of the first `_2cLu-l` element
 *   - price:       innerHTML of the first `_1vC4OE` element
 *   - cnum / onum: innerHTML of the first `_38sUEc` / `_2_KrJI` element,
 *                  or '0' when that element is absent
 *   - img:         `src` attribute of the first `_1Nyybr _30XEf0` element,
 *                  falling back to its `image-src` attribute
 *
 * A missing element or attribute yields an empty string instead of throwing
 * (or leaving a null in the record), so one incomplete entry cannot abort
 * the whole run.
 *
 * @throws if ./input-ali.txt cannot be read (error from fs.readFileSync).
 */
const getInput = () => {
  const html = fs.readFileSync('./input-ali.txt')
  const { document } = new JSDOM(html.toString()).window
  const entries = document.getElementsByClassName('_3liAhj _1R0K0g')

  // First descendant of `root` carrying all the given class names, or undefined.
  const firstByClass = (root, classNames) => root.getElementsByClassName(classNames)[0]
  // innerHTML of `el`, or `fallback` when the element is missing.
  const innerHtmlOr = (el, fallback) => (el === undefined ? fallback : el.innerHTML)
  // Attribute value, or null when the element or the attribute is missing
  // (getAttribute itself returns null for an absent attribute).
  const attrOrNull = (el, attr) => (el === undefined ? null : el.getAttribute(attr))
  // Collapses a missing value to an empty string.
  const orEmpty = value => (value === null ? '' : value)

  for (const entry of entries) {
    const link = firstByClass(entry, '_2cLu-l')
    const picture = firstByClass(entry, '_1Nyybr _30XEf0')

    // Prefer `src`; fall back to `image-src` only when `src` is absent.
    const src = attrOrNull(picture, 'src')
    const img = src === null ? attrOrNull(picture, 'image-src') : src

    items.push({
      href: orEmpty(attrOrNull(link, 'href')),
      img: orEmpty(img),
      name: orEmpty(attrOrNull(link, 'title')),
      price: innerHtmlOr(firstByClass(entry, '_1vC4OE'), ''),
      cnum: innerHtmlOr(firstByClass(entry, '_38sUEc'), '0'),
      onum: innerHtmlOr(firstByClass(entry, '_2_KrJI'), '0')
    })
  }
}
/**
 * Serializes the module-level `items` array as JSON and writes it
 * asynchronously to ./Electric_pressure_cooker/mid_output_ali.txt.
 *
 * The target directory is not created here; if it does not exist the write
 * fails and the error is reported on stderr.
 */
const setOutput = () => {
  const target = './Electric_pressure_cooker/mid_output_ali.txt'
  fs.writeFile(target, JSON.stringify(items), err => {
    // The callback also runs after a successful write (with no error),
    // so only report genuine failures instead of printing `null`.
    if (err) {
      console.error(err)
    }
  })
}
/**
 * Normalizes one field value so that it occupies exactly one cell of one
 * CSV row.
 *
 *   - undefined / null      -> null (the caller decides how to render it)
 *   - line breaks           -> replaced by a single space, so the row stays
 *                              on one line
 *   - fields containing a comma or a double quote are wrapped in double
 *     quotes, with embedded quotes doubled (RFC 4180 style), so the comma
 *     is not read as a column separator
 *
 * @param {*} str  Field value; non-string values are converted with String().
 * @returns {string|null} The CSV-safe text, or null when no value was given.
 */
const prepare = str => {
  // `== null` matches both undefined and null; getAttribute() returns null
  // for a missing attribute, which the previous `=== undefined` check let
  // through to `.replace` and crashed on.
  if (str == null) {
    return null
  }
  // /\r\n|\r|\n/g is a regular expression matching every kind of line break.
  const singleLine = String(str).replace(/\r\n|\r|\n/g, ' ')
  if (/[",]/.test(singleLine)) {
    return '"' + singleLine.replace(/"/g, '""') + '"'
  }
  return singleLine
}
/**
 * Appends every record of the module-level `items` array to
 * ./output-ali.csv, one CSV row per record: a 1-based sequence number
 * followed by href, img, name, price, cnum and onum, each passed through
 * prepare(). No header row is written.
 *
 * The file is opened in append mode (flag 'a'), so running the script again
 * adds rows to the end of the existing file instead of replacing it.
 * All rows are built first and written with a single synchronous call, so
 * a failure while building a row leaves the file untouched.
 *
 * @throws if the file cannot be written (error from fs.writeFileSync).
 */
const toExcel = () => {
  const columns = ['href', 'img', 'name', 'price', 'cnum', 'onum']
  let csv = ''
  let cnt = 1
  for (const data of items) {
    // String() renders a null from prepare() as the text "null", exactly as
    // plain string concatenation does; join() alone would emit an empty cell.
    const cells = columns.map(key => String(prepare(data[key])))
    csv += [cnt++, ...cells].join(',') + '\n'
  }
  // With no records there is nothing to append; do not create the file.
  if (csv === '') {
    return
  }
  // writeFileSync is synchronous and takes no callback: errors are thrown.
  fs.writeFileSync('./output-ali.csv', csv, { flag: 'a' })
}
// Script entry point: parse ./input-ali.txt into `items` ...
getInput()
// Debug aid (disabled): print the first parsed record.
//console.log(items[0])
// Optional step (disabled): dump `items` as JSON via setOutput().
//setOutput()
// ... then append the parsed records to ./output-ali.csv.
toExcel()

使用说明

1.打开速卖通产品列表

2.按F12,选择列表div,右键copy->copy element

3.打开input-ali.txt(脚本读取的文件名是 ./input-ali.txt),将原有数据删除后粘贴新复制的数据

4.运行 node index_ali.js,生成csv格式文件 output-ali.csv(以追加方式写入,重复运行前请先删除旧文件)