Skip to content

Commit

Permalink
FAERS scripts updated
Browse files (browse the repository at this point in the history)
  • Loading branch information
bhlieberman committed Jul 10, 2024
1 parent 23f6144 commit a45b14c
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 57 deletions.
59 changes: 10 additions & 49 deletions stages/01_download.sh
Original file line number | Diff line number | Diff line change
Expand Up
@@ -9,61 +9,22 @@ echo "Local path: $localpath"
# Create the list directory to save list of remote files and directories
listpath="$localpath/list"
echo "List path: $listpath"
mkdir -p $listpath
cd $listpath;
mkdir -p "$listpath"
cd "$listpath" || exit;

URL="https://fis.fda.gov/content/Exports/"
wget --no-remove-listing https://fis.fda.gov/extensions/FPD-QDE-FAERS/FPD-QDE-FAERS.html
cat FPD-QDE-FAERS.html | grep -Po '(?<=href=")[^"]*ascii[^"]*\.zip' | sort | cut -d "/" -f 6 > files.txt
rm "$localpath"/FPD-QDE-FAERS.html

# Create the download directory
downloadpath="$localpath/download"
echo "Download path: $downloadpath"
mkdir -p "$downloadpath"
cd $downloadpath;

# Download files
URL="https://fis.fda.gov/content/Exports/"
Year=2004
Quarter=1
FileName="aers_ascii_${Year}q${Quarter}.zip"
while true;
do
if ! wget --spider -w 2 -t 2 "$URL$FileName" 2>/dev/null;
then
break
fi
echo $FileName
echo $FileName >> "${listpath}/files.txt"
if [[ $Quarter -ge 4 ]]
then
let Year=$Year+1
Quarter=1
else
let Quarter=$Quarter+1
fi
FileName="aers_ascii_${Year}q${Quarter}.zip"
done
Year=2012
Quarter=4
FileName="faers_ascii_${Year}q${Quarter}.zip"
while true;
do
if ! wget --spider -w 2 -t 2 "$URL$FileName" 2>/dev/null;
then
break
fi
echo $FileName
echo $FileName >> "${listpath}/files.txt"
if [[ $Quarter -ge 4 ]]
then
let Year=$Year+1
Quarter=1
else
let Quarter=$Quarter+1
fi
FileName="faers_ascii_${Year}q${Quarter}.zip"
done
cd "$downloadpath" || exit;

# Download files in parallel
cat $listpath/files.txt | xargs -P14 -n1 bash -c '
echo $1
cat "$listpath"/files.txt | xargs -P14 -n1 bash -c '
echo $URL$1
wget -nH -q -nc -P '$downloadpath' '$URL'$1' {}

echo "Download done."
4 changes: 2 additions & 2 deletions stages/02_unzip.sh
Original file line number | Diff line number | Diff line change
Expand Up
@@ -20,8 +20,8 @@ mkdir -p $rawpath
echo "Raw path: $rawpath"

# Unzip files in parallel
cat $listpath/files.txt | tail -n +2 | xargs -P14 -n1 bash -c '
filename="${1%.*}"
cat $listpath/files.txt | head -n -3 | xargs -P14 -n1 bash -c '
filename=$(basename "$1" .zip)
echo '$downloadpath'/$1
echo '$rawpath'/$filename
unzip '$downloadpath'/$1 -d '$rawpath'/$filename
Expand Down
10 changes: 4 additions & 6 deletions stages/03_build.sh
Original file line number | Diff line number | Diff line change
Expand Up
@@ -23,12 +23,10 @@ echo "Brick path: $brickpath"
# calling a Python function with arguments input and output filenames
for class in drug demo indi outc reac rpsr
do
mkdir -p $brickpath/$class.parquet
index=0
for infile in `find $rawpath -type f -iname $class*.txt | sort`
mkdir -p "$brickpath"/$class.parquet
for infile in $(find "$rawpath" -type f -iname "$class*.txt" | sort)
do
outfile="$brickpath/$class.parquet/$class`printf %03d $index`.parquet"
python stages/csv2parquet.py $infile $outfile
let index=$index+1
outfile="$brickpath/$class.parquet/$(basename "$infile" .txt).parquet"
python3 stages/csv2parquet.py "$infile" "$outfile"
done
done

0 comments on commit a45b14c

Please sign in to comment.