Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: use axios instead of node-fetch #2

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 3 additions & 47 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,48 +1,4 @@
# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msm
*.msp

# Windows shortcuts
*.lnk

# =========================
# Operating System Files
# =========================

# OSX
# =========================

.DS_Store
.AppleDouble
.LSOverride

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
.vscode/
node_modules/
config/default.json
tracks.json
2 changes: 2 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# discourse crawler
A simple script that fetches every page of topics in a given Discourse category and saves the results as a JSON file.
86 changes: 53 additions & 33 deletions app.js
Original file line number Diff line number Diff line change
@@ -1,63 +1,80 @@
const fetch = require('node-fetch');
const jsonfile = require('jsonfile')
const YAML = require('yamljs');
const github = require('octonode');
const config = require('config');
// import fetch from 'node-fetch';
import jsonfile from 'jsonfile';
import YAML from 'yamljs';
import axios from 'axios';
// const github = require('octonode');
// const config = require('config');
/**
 * Pause helper: returns a promise that settles after a timer fires.
 *
 * @param {number} ms - Timeout, in milliseconds, passed to setTimeout.
 * @returns {Promise<undefined>} Promise fulfilled (with no value) once the timer elapses.
 */
const delay = function (ms) {
  return new Promise(function (done) {
    setTimeout(done, ms);
  });
};

let token = config.get('token');
let client = github.client(token);
let ghrepo = client.repo('PDIS/web-jekyll');

let more_url = '';
// let token = config.get('token');
// let client = github.client(token);
// let ghrepo = client.repo('PDIS/web-jekyll');
let discourse_site_url = 'https://talk.pdis.nat.gov.tw';
let category_name = 'pdis-site/how-we-work-track';
let remove_posts = [73] // remove "definition" post
let file = './tracks.json';
let topics = [];
let posts = [];
let file = '/var/discourse/api/tracks.json'

let getIDs = async (more_url) => {
if (more_url == '') {
query = "http://talk.pdis.nat.gov.tw/c/pdis-site/how-we-work-track.json";
let getIDs = async (more_url = "") => {
let query = '';
if (more_url.includes("page")) {
query = discourse_site_url + more_url.replace(/\?page/, '.json?page');
} else {
query = "http://talk.pdis.nat.gov.tw" + more_url.replace(/\?page/, '.json?page');
query = `${discourse_site_url}/c/${category_name}.json`;
}
let response = await fetch(query);
let data = await response.json()
more_url = data.topic_list.more_topics_url || ''
let topics_tmp = data.topic_list.topics
// topics_tmp.splice(0,1) // remove first post (duplicated)
console.log(`fetching url: ${query}`); // print query url
let response = await axios.get(query)
let data = response.data
let topics_tmp = data.topic_list.topics;
topics_tmp.map(t => topics.push(t.id))
// check if there's more pages to fetch
more_url = data.topic_list.more_topics_url || '';
if (more_url != '') { // recursively getIDs
let ids = await getIDs(more_url);
await delay(500); // wait for next fetch
await getIDs(more_url);
}
}

let getPosts = async () => { // 取得單篇PO文
// * remove duplicated post
topics = topics.filter((topic, i) => topics.indexOf(topic) === i)
topics = topics.filter((topic, i) => topics.indexOf(topic) === i);
// * remove "definition" post
topics = topics.filter((topic, i) => topic != '73')
topics = topics.filter((topic) => remove_posts.indexOf(topic) == -1);
for (let id of topics) {
try {
let response = await fetch('http://talk.pdis.nat.gov.tw/t/' + id + ".json?include_raw=1")
let data = await response.json()
let response = await axios.get(`${discourse_site_url}/t/${id}.json?include_raw=1`);
let data = await response.data;
let post = {};
post['id'] = data['id']
post['id'] = data['id'];
post['title'] = data['title'];
post['date'] = await new Date(data['created_at'].toString()).toISOString().substring(0, 10);
post['date'] = new Date(data['created_at'].toString()).toISOString().substring(0, 10); // 2022-02-22
post['tags'] = data['tags'];
let raw = data['post_stream']['posts'][0]['raw'];
post['content'] = YAML.parse(raw)['content'];
posts.push(post);
console.log(`Post found: ${post.title}`); // print post content
await delay(500); // wait for next fetch
}
catch(e) {
console.error(e)
console.error(`getPost err: ${e}`);
}
}
}

let updateFile = () => {
jsonfile.writeFile(file, posts, function (err) {
console.error(err)
// save file locally
jsonfile.writeFile(file, posts, function (e) {
if (e) {
console.error(`updateFile err: ${e}`);
}
else {
console.log('wrote file successfully')
}
})
}

let triggerGithub = () => {
// * trigger GitHub Actions workflow API
fetch('https://api.github.com/repos/PDIS/web-jekyll/actions/workflows/github-pages-deploy.yml/dispatches', {
method: 'POST',
Expand All @@ -82,7 +99,7 @@ let gitcommit = () => {
ghrepo.contents('_data/tracks.json', function (err, data, headers) {
console.error("error: " + err);
if (typeof data == 'undefined' || typeof data === null) {
ghrepo.createContents('_data/tracks.json', 'update tracks.json', stringdata, function (err, data, headers) {
ghrepo.createContents('_data/tracks.json', 'create tracks.json', stringdata, function (err, data, headers) {
console.error("error: " + err);
console.error("data: " + JSON.stringify(data));
});
Expand All @@ -95,5 +112,8 @@ let gitcommit = () => {
});
}

// getIDs(more_url).then(getPosts).then(gitcommit)
getIDs(more_url).then(getPosts).then(updateFile)
getIDs()
.then(getPosts)
.then(updateFile)
// .then(triggerGithub)
// .then(gitcommit)
18 changes: 18 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"name": "get-tracks-from-discourse",
"version": "1.0.0",
"description": "a simple script in order to fetch all the pages of topics in certain discourse category",
"main": "app.js",
"type": "module",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1",
"build": "node app.js"
},
"author": "pdis",
"license": "ISC",
"dependencies": {
"axios": "^1.3.2",
"jsonfile": "^6.1.0",
"yamljs": "^0.3.0"
}
}