Commit 8abd65d: init

man3kin3ko committed May 13, 2022 (0 parents)
Showing 30 changed files with 963 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
__pycache__/
.ipynb_checkpoints/
outputs/
checkpoints/*
timing/*
transcription_prepared/tmpfolder*
54 changes: 54 additions & 0 deletions README.md
@@ -0,0 +1,54 @@
## Simple Antifraud
****
This is a simplified voice antifraud system created as part of a bachelor's thesis at [Moscow Polytechnic University](https://mospolytech.ru/). The system is based on a pre-trained DeepSpeech model, a Naive Bayes classifier, and a TF-IDF vectorizer.

The project was built to illustrate the impact of adversarial attacks on this type of system, so it should not be used in production. Even if you consider DeepSpeech sufficiently protected, the classifier itself is vulnerable to [Bayesian poisoning](https://en.wikipedia.org/wiki/Bayesian_poisoning).

It is also a kind of [Damn Vulnerable Service](https://github.com/vavkamil/awesome-vulnerable-apps), so you can get a flag if you abuse it properly.

### Project structure

- `checkpoints` contains `.ckpt` files of pretrained DeepSpeech models. Pretrained models can be found in [Releases](releases).
- `training` includes a notebook with data preparation and fitting for the NB classifier and the TF-IDF vectorizer.
- `pickles` is the folder used to store the fitted classifier and vectorizer.

### Installation

Install deepspeech.pytorch:

```
git clone https://github.com/SeanNaren/deepspeech.pytorch
cd deepspeech.pytorch
pip install -r requirements.txt
pip install -e .
```

Clone this repository and run the following inside it to install the remaining dependencies:
```
pip install -r requirements.txt
```

### Mitigations

The robustness of the original [LibriSpeech model]() can be increased by adversarial retraining with Gaussian data augmentation. An example model can be found in [Releases](releases). You can also try other controls, described [here](https://www.enisa.europa.eu/publications/securing-machine-learning-algorithms).
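
As a minimal sketch of what the augmentation step looks like (the full implementation lives in `data_maker.py` in this repository; the function name and the default `sigma` below are illustrative):

```
import numpy as np

def add_gaussian_noise(audio: np.ndarray, sigma: float = 0.01) -> np.ndarray:
    # Drawing from N(audio, sigma) is equivalent to adding zero-mean
    # Gaussian noise with standard deviation sigma to the signal.
    return np.random.normal(audio, sigma, audio.shape).astype(np.float32)
```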

To retrain a model with new data, the [original training script](https://github.com/SeanNaren/deepspeech.pytorch/blob/master/deepspeech_pytorch/training.py) can be used. Simply replace
```
model = DeepSpeech(
    labels=labels,
    model_cfg=cfg.model,
    optim_cfg=cfg.optim,
    precision=cfg.trainer.precision,
    spect_cfg=cfg.data.spect
)
```
with

```
model = DeepSpeech.load_from_checkpoint(
    cfg.checkpoint.filepath,
    freeze=True,
    learning_rate=0.0001
)
```
Then you can retrain it with `python3 train.py checkpoint.filepath=/path/to/file.ckpt`.
73 changes: 73 additions & 0 deletions adversarial_generation.py
@@ -0,0 +1,73 @@
import time
from os import path
from typing import Callable, Dict, List
import torch
import torchaudio
import numpy as np
import art.estimators.speech_recognition as asr
from art.attacks.evasion import ImperceptibleASRPyTorch

# Suppress library warnings that would clutter the attack output.
import warnings
warnings.filterwarnings(action='ignore')

def save_to_txt(dest_dir, filename, content):
    with open(path.join(dest_dir, filename), 'a') as f:
        f.write(content)

def time_to_file(
    file_maker : Callable,
    dest_dir : str = 'timing',
):
    # Decorator factory: times the wrapped attack and logs the elapsed
    # time with `file_maker`, one log file per input sample.
    def decorator(function):
        def wrapper(model, filepath, transcription):
            start_time = time.time()
            result = function(model, filepath, transcription)
            elapsed = time.time() - start_time
            sample = path.split(filepath)[-1]
            file_maker(
                dest_dir,
                sample.split('.')[0],
                f'Time elapsed: {elapsed}s for {sample}\n'
            )
            return result
        return wrapper
    return decorator

@time_to_file(save_to_txt)
def make_adversarial(model, filepath, labels):
    # Run the Imperceptible ASR attack against the model to craft an
    # adversarial waveform that transcribes as `labels`.
    adversarial = ImperceptibleASRPyTorch(model)
    audio = load_np_audio(filepath)
    waveform_np = adversarial.generate(audio, labels)
    return waveform_np

def create_args(sample_list, transcription):
    # Pair each sample with the transcription of its recording id (the
    # part of the filename before the first underscore). Samples with no
    # matching transcription are skipped; building the pairs directly
    # avoids the misalignment a separate zip() over two lists would cause.
    pairs = []
    for sample in sample_list:
        rec = sample.split('_')[0]
        if rec in transcription:
            pairs.append((sample, transcription[rec]))
    return pairs

def load_np_audio(filepath):
    return torchaudio.load(filepath)[0].numpy()

def save_np_audio(array, filename, destdir):
    tensor = torch.from_numpy(array)
    filepath = path.join(destdir, filename)
    torchaudio.save(filepath, tensor, 16000)

def create_advs(
    model : asr.PyTorchDeepSpeech,
    source_dir : str,
    dest_dir : str,
    samples : List[str],
    transcriptions : Dict[str, str]
) -> List[np.ndarray]:
    # Craft and save an adversarial example for every sample that has a
    # transcription; the waveforms are also returned for inspection.
    advs = []
    for sample, target in create_args(samples, transcriptions):
        advs.append(make_adversarial(
            model,
            path.join(source_dir, sample),
            np.array([target])
        ))
        save_np_audio(advs[-1], sample, dest_dir)
    return advs
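
A hypothetical usage sketch for `create_advs`, assuming ART's pretrained LibriSpeech estimator; the sample names, target transcriptions, and folder names are illustrative:

```
import art.estimators.speech_recognition as asr
from adversarial_generation import create_advs

# Wrap a pretrained DeepSpeech model with ART's estimator.
model = asr.PyTorchDeepSpeech(pretrained_model="librispeech")

# Filenames begin with a recording id that keys into the dict below.
samples = ["rec1_0.wav", "rec2_0.wav"]
transcriptions = {"rec1": "HELLO WORLD", "rec2": "TRANSFER THE MONEY"}

advs = create_advs(model, "samples", "outputs", samples, transcriptions)
```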
16 changes: 16 additions & 0 deletions change_samplerate.sh
@@ -0,0 +1,16 @@
#!/bin/bash

ORIG=$1 # directory that contains the original audio
SOX=$2  # directory where the converted audio will be placed

subdirs=$(ls "$ORIG");
cd "$ORIG"
for dir in $subdirs; do
    mkdir "../$SOX/$dir"
    for audio in $(ls "$dir"); do
        sox -v 0.86 "$dir/$audio" \
            --bits 16 --no-dither --compression 0.0 \
            "../$SOX/$dir/$audio" \
            channels 1 rate 16000 || echo "$audio corrupted"
    done
done
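
For example, `./change_samplerate.sh LibriSpeech converted` would convert every subdirectory of `LibriSpeech` into 16 kHz mono 16-bit WAVs under `converted` (both directory names are illustrative; the destination directory must already exist next to the source directory, since the script only creates its subdirectories).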
113 changes: 113 additions & 0 deletions data_maker.py
@@ -0,0 +1,113 @@
import shutil
from pathlib import Path
import numpy as np
from deepspeech_pytorch.data.utils import create_manifest
from adversarial_generation import save_np_audio, load_np_audio

class NoisePreprocessor:

    HARD_NOISE = .1
    MED_NOISE = .01
    LIGHT_NOISE = .005

    def __init__(self, sigma=.01):
        self.sigma = sigma

    def set_noise(self, sigma):
        if sigma < 1:
            self.sigma = sigma
        else:
            raise ValueError('sigma must be less than 1')

    def apply_noise(self, x):
        # Sampling from N(x, sigma) is equivalent to adding zero-mean
        # Gaussian noise with standard deviation sigma to the signal.
        noise = np.random.normal(x, self.sigma, x.shape)
        return noise.astype(np.float32)

class DataMaker():

    # Assumed to be the total number of utterances in the full
    # LibriSpeech training set.
    LIBRISPEECH_MAX = 281241

    def __init__(
        self,
        samples_folder : str,
        dest_path : str,
        manifest_path : str = 'manifests',
        num_workers : int = 1
    ):
        self.num_workers = num_workers
        self.manifest_path = Path(manifest_path)
        self.dest_path = Path(dest_path)
        self.dest_wav = self.dest_path / 'wav'
        self.dest_txt = self.dest_path / 'txt'
        self.samples_wav = Path(samples_folder) / 'wav'
        self.samples_txt = Path(samples_folder) / 'txt'
        self.prep = NoisePreprocessor()

    def _create_manifest(self, prefix):
        output_name = f'{prefix}_noise.json'
        create_manifest(
            str(self.dest_path),
            output_name,
            self.manifest_path,
            self.num_workers
        )

    def _save_sample(self, name, text, audio):
        save_np_audio(audio, f'{name}.wav', str(self.dest_wav))
        (self.dest_txt / f'{name}.txt').write_text(text)

    def _get_text(self, name):
        return (self.samples_txt / f'{name}.txt').read_text()

    def _apply_noise(self, sample, times):
        audio = load_np_audio(sample)
        # Path.stem drops the extension safely; rstrip('.wav') would also
        # strip any trailing 'w', 'a', 'v', or '.' characters from the name.
        name = sample.stem
        text = self._get_text(name)
        for t in range(times):
            audio_noised = self.prep.apply_noise(audio)
            # Save the noised copy, not the clean original.
            self._save_sample(f'{name}_{t}', text, audio_noised)

    def _make_dirs(self):
        self.dest_path.mkdir()
        self.dest_wav.mkdir()
        self.dest_txt.mkdir()

    def apply_noise(self, prefix='train', times=3):
        self._make_dirs()
        for sample in self.samples_wav.iterdir():
            self._apply_noise(sample, times)
        self._create_manifest(prefix)

    def _val_random(self, size):
        gen = np.random.default_rng()
        return sorted(gen.choice(
            self.LIBRISPEECH_MAX,
            size=size,
            replace=False
        ))

    def _copy_sample(self, wav):
        name = wav.stem  # safe extension removal (see _apply_noise)
        shutil.copy(str(wav), self.dest_wav)
        txt = self.samples_txt / f'{name}.txt'
        shutil.copy(str(txt), self.dest_txt)

    def _create_random(self, size):
        # A set makes each membership test O(1).
        indexes = set(self._val_random(size))
        for i, wav in enumerate(self.samples_wav.iterdir()):
            if i in indexes:
                self._copy_sample(wav)

    def _create_iterate(self, size):
        for i, wav in enumerate(self.samples_wav.iterdir()):
            if i == size:
                break
            self._copy_sample(wav)

    def create_vals(self, size, random=False):
        self._make_dirs()
        if random:
            self._create_random(size)
        else:
            self._create_iterate(size)
        self._create_manifest(prefix='val')
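
A hypothetical usage sketch of `DataMaker`, assuming a `samples` folder containing `wav/` and `txt/` subfolders; all folder names are illustrative, and the destination folders must not already exist:

```
from data_maker import DataMaker

# Build a noise-augmented training set (three noised copies per sample)
# along with its deepspeech.pytorch manifest.
maker = DataMaker('samples', 'train_noised', manifest_path='manifests')
maker.apply_noise(prefix='train', times=3)

# Build a validation split from a random subset of the samples.
val_maker = DataMaker('samples', 'val_set', manifest_path='manifests')
val_maker.create_vals(size=100, random=True)
```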
