-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathindex.js
71 lines (69 loc) · 2.15 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"use strict";
const YAML = require('yaml');
var fs = require('fs');
/**
 * Pulls structured data out of a DOM tree using a set of named selectors
 * loaded from a YAML file.
 *
 * Each selector config is an object that may carry these keys (all read by
 * the code below): `css` or `xpath` (how to locate elements), `type`
 * ('Text' | 'Link' | 'HTML' | 'Attribute'), `attribute` (name used when
 * type is 'Attribute'), `multiple` (return every match instead of the
 * first) and `children` (nested selector configs evaluated relative to
 * each matched element).
 */
class Extractor {
  /**
   * @param {string} filename - Path of the YAML selector file; it is read
   *   synchronously as UTF-8 and parsed into `this.selectors`.
   */
  constructor(filename) {
    const file = fs.readFileSync(filename, 'utf8');
    this.selectors = YAML.parse(file);
  }

  /**
   * Reads a single value from a matched element.
   *
   * @param {object} element - Matched DOM element.
   * @param {string} [selector_type] - 'Text' (also used when falsy),
   *   'Link', 'HTML' or 'Attribute'.
   * @param {string} [attribute=''] - Attribute name for the 'Attribute' type.
   * @returns {*} The extracted value, or `undefined` for an unrecognised type.
   */
  extract_field(element, selector_type, attribute = '') {
    if (!selector_type || selector_type === 'Text') {
      return element.textContent;
    }
    if (selector_type === 'Link') {
      return element.href || '';
    }
    if (selector_type === 'HTML') {
      // The markup is returned URI-encoded, not raw.
      return encodeURI(element.innerHTML);
    }
    if (selector_type === 'Attribute') {
      return element.getAttribute(attribute);
    }
    return undefined;
  }

  /**
   * Builds an object with one entry per nested selector, each evaluated
   * with `element` as the root.
   *
   * @param {object} element - Element that acts as the root for the children.
   * @param {object} field_config - Map of field name to selector config.
   * @returns {object} Map of field name to extracted value.
   */
  get_child(element, field_config) {
    const child = {};
    for (const field in field_config) {
      child[field] = this.extract_selector(element, field_config[field]);
    }
    return child;
  }

  /**
   * Locates the elements a selector config refers to, below `root_element`.
   *
   * XPath lookups go through the standard `evaluate(expression, contextNode,
   * resolver, type, result)` call on the owning document, so that they also
   * work when the root is an element (elements have no `evaluate` of their
   * own) and so that the result can be iterated.
   *
   * @param {object} root_element - Document or element to search in.
   * @param {object} field_config - Selector config with `xpath` or `css`.
   * @returns {object} A collection that supports `forEach`.
   */
  _select_elements(root_element, field_config) {
    if (!('xpath' in field_config)) {
      return root_element.querySelectorAll(field_config['css']);
    }

    // Numeric value of XPathResult.ORDERED_NODE_SNAPSHOT_TYPE; spelled out
    // because the XPathResult global is not available outside a browser.
    const ORDERED_NODE_SNAPSHOT_TYPE = 7;

    // `ownerDocument` is null on a document itself, so fall back to the root.
    const owner = root_element.ownerDocument || root_element;
    const result = owner.evaluate(
      field_config['xpath'],
      root_element,
      null,
      ORDERED_NODE_SNAPSHOT_TYPE,
      null
    );

    if (typeof result.snapshotLength !== 'number') {
      // Not a snapshot result: presumably a non-standard root whose
      // evaluate() already returns a forEach-able collection; pass it on.
      return result;
    }

    const nodes = [];
    for (let index = 0; index < result.snapshotLength; index += 1) {
      nodes.push(result.snapshotItem(index));
    }
    return nodes;
  }

  /**
   * Evaluates one selector config against a root node.
   *
   * @param {object} root_element - Document or element to search in.
   * @param {object} field_config - Selector config (see class description).
   * @returns {*} An array of values when `multiple` is exactly `true`,
   *   otherwise the first value (`undefined` when nothing matched).
   */
  extract_selector(root_element, field_config) {
    const elements = this._select_elements(root_element, field_config);
    const values = [];
    elements.forEach((element) => {
      if ('children' in field_config) {
        values.push(this.get_child(element, field_config['children']));
      } else {
        values.push(
          this.extract_field(element, field_config['type'], field_config['attribute'])
        );
      }
    });
    return field_config['multiple'] === true ? values : values[0];
  }

  /**
   * Runs every top-level selector in `this.selectors` against `document`.
   *
   * @param {object} document - Root to search in.
   * @returns {object} Map of selector name to extracted value.
   */
  extract(document) {
    const data = {};
    for (const selector_name in this.selectors) {
      data[selector_name] = this.extract_selector(document, this.selectors[selector_name]);
    }
    return data;
  }
}
// Expose the Extractor class as a named CommonJS export.
exports.Extractor = Extractor;