-
Notifications
You must be signed in to change notification settings - Fork 0
92 lines (72 loc) · 2.8 KB
/
ensembl_rapid_release.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Lists the Ensembl Rapid Release FTP tree, finds .gff3.gz files whose path
# does not mention an accession already recorded in the output TSV, appends
# them as "accession<TAB>https URL" rows, and commits the result.
name: Retrieve GFF3 Paths from Ensembl RR

on:
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
  retrieve-gff3:
    runs-on: ubuntu-latest
    # The final step pushes a commit with GITHUB_TOKEN, which requires write
    # access to the repository contents.
    permissions:
      contents: write
    defaults:
      run:
        # Naming the shell explicitly makes GitHub run each step with
        # `bash -eo pipefail`, so a failure anywhere in a pipeline fails the
        # step instead of being masked by the last command.
        shell: bash
    env:
      # Ensembl FTP
      FTP: ftp.ensembl.org
      DIR: /pub/rapid-release/species/
      # Temporary files (all removed before the commit step)
      PATHS: paths.tsv
      EXISTING_ACCESSIONS: existing_accessions.tsv
      SCRAPE: scrape.txt
      # Output
      OUTPUT: ensembl_rapid_release.tsv
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install lftp
        run: sudo apt-get update && sudo apt-get install -y lftp

      - name: Get existing Assemblies or Initialize Output File
        run: |
          # Initialize the output file for FTP paths if it doesn't exist
          if [[ ! -f "$OUTPUT" ]]; then
            printf 'accession\tfull_path\n' > "$OUTPUT"
          fi
          # Save existing accessions to a file, skipping the header.
          # Blank rows are dropped: an empty pattern makes `grep -F` match
          # every line, and the later `grep -v` would then discard all paths.
          awk 'NR > 1 && $1 != "" {print $1}' "$OUTPUT" \
            | sort -u > "$EXISTING_ACCESSIONS"

      - name: Scrape FTP directory
        run: |
          # Use lftp to connect to the server and list all files recursively
          lftp -c "
            set net:timeout 10
            set net:max-retries 3
            open $FTP
            cd $DIR
            ls -R
            quit
          " > "$SCRAPE"
          # An empty listing means the scrape did not work; stop here rather
          # than silently reporting that there are no new files.
          if [[ ! -s "$SCRAPE" ]]; then
            echo "::error::FTP listing of $FTP$DIR is empty"
            exit 1
          fi

      - name: Filter new GFF3 paths
        run: |
          # Process the raw listing to get only new .gff3.gz files
          # (excluding paths that contain an already-recorded accession).
          # grep exits 1 when nothing is left after filtering; that simply
          # means "no new files" and must not fail the job. Any other
          # non-zero status (a real grep error) still fails the step.
          awk '
            # A line ending in a colon names the directory being listed
            /:$/ {dir = substr($0, 1, length($0) - 1); next}
            # For a .gff3.gz entry, print directory + "/" + file name
            /\.gff3\.gz$/ {print dir "/" $NF}
          ' "$SCRAPE" \
            | { grep -vFf "$EXISTING_ACCESSIONS" || [[ $? -eq 1 ]]; } > "$PATHS"
          cat "$PATHS"
          rm "$SCRAPE" "$EXISTING_ACCESSIONS"

      - name: Map GFF3 paths to TSV file
        run: |
          # Map the filtered GFF3 paths to rows of the TSV file.
          # Assumes each path looks like ./<species>/<accession>/... so that
          # the third "/"-separated field is the accession - TODO confirm
          # against real lftp output.
          # substr($0, 2) drops the first character of the path (presumably
          # the leading ".") before it is appended to the HTTPS base URL.
          awk -F'/' -v base="https://${FTP}${DIR%/}" '{
            print $3 "\t" base substr($0, 2)
          }' "$PATHS" >> "$OUTPUT"
          rm "$PATHS"

      # NOTE(review): this action is referenced by a mutable branch
      # (`master`); consider pinning it to a release tag or commit SHA.
      - name: Commit & Push changes
        uses: actions-js/push@master
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}