init

man3kin3ko · May 13, 2022 · 8abd65d · 8abd65d
commit 8abd65d
Show file tree

Hide file tree

Showing 30 changed files with 963 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,6 @@
+__pycache__/
+.ipynb_checkpoints/
+outputs/
+checkpoints/*
+timing/*
+transcription_prepared/tmpfolder*
diff --git a/README.md b/README.md
@@ -0,0 +1,54 @@
+## Simple Antifraud
+****
+This is a simplified voice antifraud system created as part of a bachelor's thesis at [Moscow Polytechnic University](https://mospolytech.ru/). The system is based on a pre-trained DeepSpeech model, a Naive Bayes classifier and a TF-IDF vectorizer.
+
+The project was built to illustrate the impact of adversarial attacks on this type of system, so it should not be used in production. Even if you think that DeepSpeech is protected well enough, the classifier itself is vulnerable to [Bayesian poisoning](https://en.wikipedia.org/wiki/Bayesian_poisoning).
+
+This is a kind of [Damn-Vulnerable Service](https://github.com/vavkamil/awesome-vulnerable-apps), so you can get a flag if you abuse it properly.
+
+### Project structure
+
+- `checkpoints` contains the .ckpt files of pretrained DeepSpeech models. Pretrained models can be found [here](releases).
+- `training` includes a notebook with data preparation and fitting of the NB classifier and the vectorizer.
+- `pickles` is used to store the fitted classifier and vectorizer.
+
+### Installation
+
+Install deepspeech.pytorch:
+
+```
+git clone https://github.com/SeanNaren/deepspeech.pytorch
+cd deepspeech.pytorch
+pip install -r requirements.txt
+pip install -e .
+```
+
+Clone this repository and run the following command inside it to install the remaining dependencies:
+```
+pip install -r requirements.txt
+```
+
+### Mitigations
+
+The robustness of the original [LibriSpeech model]() can be increased using adversarial retraining with Gaussian data augmentation. An example model can be found in [Releases](releases). You can also try other controls, described [here](https://www.enisa.europa.eu/publications/securing-machine-learning-algorithms).
+
+To retrain a model with new data, the [original training script](https://github.com/SeanNaren/deepspeech.pytorch/blob/master/deepspeech_pytorch/training.py) can be used. Simply replace
+```
+model = DeepSpeech(
+        labels=labels,
+        model_cfg=cfg.model,
+        optim_cfg=cfg.optim,
+        precision=cfg.trainer.precision,
+        spect_cfg=cfg.data.spect
+    )
+```
+with 
+
+```
+    model = DeepSpeech.load_from_checkpoint(
+        cfg.checkpoint.filepath,
+        freeze=True,
+        learning_rate=0.0001
+    )
+```
+so you can retrain it like `python3 train.py checkpoint.filepath=/path/to/file.ckpt`.
diff --git a/adversarial_generation.py b/adversarial_generation.py
@@ -0,0 +1,73 @@
+import time
+from os import path
+from typing import Callable, List
+import torch
+import torchaudio
+import numpy as np
+import art.estimators.speech_recognition as asr
+from art.attacks.evasion import ImperceptibleASRPyTorch
+
+import warnings
+warnings.filterwarnings(action='ignore')
+
+def save_to_txt(dest_dir, filename, content):
+    with open(path.join(dest_dir, filename), 'a') as f:
+        f.write(content)
+
+def time_to_file(
+    file_maker : Callable,
+    dest_dir : str = 'timing',
+):
+    def decorator(function):
+        def wrapper(model, filepath, transcription):
+            start_time = time.time()
+            result = function(model, filepath, transcription)
+            elapsed = time.time() - start_time
+            sample = path.split(filepath)[-1]
+            file_maker(
+                dest_dir, 
+                sample.split('.')[0],
+                f'Time elapsed: {elapsed}s for {sample}\n'
+            )
+            return result
+        return wrapper
+    return decorator
+
+@time_to_file(save_to_txt)
+def make_adversarial(model, filepath, labels):
+    adversarial = ImperceptibleASRPyTorch(model)
+    audio = load_np_audio(filepath)
+    waveform_np = adversarial.generate(audio, labels)
+    return waveform_np
+
+def create_args(sample_list, transcription):
+    transc = list()
+    for sample in sample_list:
+        rec = sample.split('_')[0]
+        if rec in transcription.keys():
+            transc.append(transcription[rec])
+    return zip(sample_list, transc)
+
+def load_np_audio(filepath):
+    return torchaudio.load(filepath)[0].numpy()
+
+def save_np_audio(array, filename, destdir):
+    tensor = torch.from_numpy(array)
+    filepath = path.join(destdir, filename)
+    torchaudio.save(filepath, tensor, 16000)
+
+def create_advs(
+    model : asr.PyTorchDeepSpeech,
+    source_dir : str, 
+    dest_dir : str, 
+    samples : List[str], 
+    transcriptions : List[str]
+) -> List[np.ndarray]:
+    advs = []
+    for args in create_args(samples, transcriptions):
+        advs.append(make_adversarial(model, 
+                         path.join(source_dir, args[0]), 
+                         np.array([args[1]])
+        ))
+        save_np_audio(advs[-1], args[0], dest_dir)
+    return advs
diff --git a/change_samplerate.sh b/change_samplerate.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+ORIG=$1 #directory that contain original audio
+SOX=$2 #direcory where converted audio will be placed
+
+subdirs=$(ls $ORIG);
+cd $ORIG
+for dir in $subdirs; do
+    mkdir "../$SOX/$dir"
+    for audio in $(ls $dir); do
+        sox -v 0.86 "$dir/$audio"\
+        --bits 16 --no-dither --compression 0.0\
+        "../$SOX/$dir/$audio"\
+        channels 1 rate 16000 || echo $audio corrupted
+    done
+done
diff --git a/data_maker.py b/data_maker.py
@@ -0,0 +1,113 @@
+import shutil
+from pathlib import Path
+import numpy as np
+from deepspeech_pytorch.data.utils import create_manifest
+from adversarial_generation import save_np_audio, load_np_audio
+
+class NoisePreprocessor:
+
+    HARD_NOISE = .1
+    MED_NOISE = .01
+    LIGHT_NOISE = .005
+
+    def __init__(self, sigma=.01):
+        self.sigma = sigma
+
+    def set_noise(self, sigma):
+        if sigma < 1:
+            self.sigma = sigma
+        else:
+            raise ValueError
+
+    def apply_noise(self, x):
+        noise = np.random.normal(x, self.sigma, x.shape)
+        return noise.astype(np.float32)
+
+class DataMaker():
+
+    LIBRISPEECH_MAX = 281241
+
+    def __init__(
+        self,
+        samples_folder : str,
+        dest_path : str,
+        manifest_path : str = 'manifests',
+        num_workers: int = 1
+        ):
+        self.num_workers = num_workers
+        self.manifest_path = Path(manifest_path)
+        self.dest_path = Path(dest_path)
+        self.dest_wav = self.dest_path / 'wav'
+        self.dest_txt = self.dest_path / 'txt'
+        self.samples_wav = Path(samples_folder) / 'wav'
+        self.samples_txt = Path(samples_folder) / 'txt'
+        self.prep = NoisePreprocessor()
+
+    def _create_manifest(self, prefix):
+        output_name = f'{prefix}_noise.json'
+        create_manifest(
+                        str(self.dest_path), 
+                        output_name,
+                        self.manifest_path,
+                        self.num_workers
+                        )
+
+    def _save_sample(self, name, text, audio):
+        save_np_audio(audio, f'{name}.wav', str(self.dest_wav))
+        (self.dest_txt / f'{name}.txt').write_text(text)
+
+    def _get_text(self, name):
+        return (self.samples_txt / f'{name}.txt').read_text()
+
+    def _apply_noise(self, sample, times):
+        audio = load_np_audio(sample)
+        name = sample.name.rstrip('.wav')
+        text = self._get_text(name)
+        for t in range(times):
+            audio_noised = self.prep.apply_noise(audio)
+            self._save_sample(f'{name}_{t}', text, audio)
+
+    def _make_dirs(self):
+        self.dest_path.mkdir()
+        self.dest_wav.mkdir()
+        self.dest_txt.mkdir()
+
+    def apply_noise(self, prefix='train', times=3):
+        self._make_dirs()
+        for sample in self.samples_wav.iterdir():
+            self._apply_noise(sample, times)
+        self._create_manifest(prefix)
+
+    def _val_random(self, size):
+        gen = np.random.default_rng()
+        return sorted(gen.choice(
+                self.LIBRISPEECH_MAX, 
+                size=size, 
+                replace=False
+                ))
+
+    def _copy_sample(self, wav):
+        name = wav.name.rstrip('.wav')
+        shutil.copy(str(wav), self.dest_wav)
+        txt = self.samples_txt / f'{name}.txt'
+        shutil.copy(str(txt), self.dest_txt)
+
+    def _create_random(self, size):
+        indexes = self._val_random(size)
+        for i, wav in enumerate(self.samples_wav.iterdir()):
+            if i in indexes:
+                self._copy_sample(wav)
+
+    def _create_iterate(self, size):
+        for i, wav in enumerate(self.samples_wav.iterdir()):
+            if i == size:
+                break
+            self._copy_sample(wav)
+
+    def create_vals(self, size, random=False):
+        self._make_dirs()
+        if random:
+            self._create_random(size)
+        else:
+            self._create_iterate(size)
+        self._create_manifest(prefix='val')