diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f59bb5c --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +__pycache__/ +.ipynb_checkpoints/ +outputs/ +checkpoints/* +timing/* +transcription_prepared/tmpfolder* \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..ba470cd --- /dev/null +++ b/README.md @@ -0,0 +1,54 @@ +## Simple Antifraud +**** +This is a simplified voice antifraud system created as part of a bachelor's thesis at [Moscow Polytechnic University](https://mospolytech.ru/). The system is based on a pre-trained DeepSpeech model, a Naive Bayes classifier and a TF-IDF vectorizer. + +The project was built to illustrate the impact of adversarial attacks on this type of system, so it should not be used in production. Even if you think that DeepSpeech is protected enough, the classifier is itself vulnerable to [Bayesian poisoning](https://en.wikipedia.org/wiki/Bayesian_poisoning). + +This is a kind of [Damn-Vulnerable Service](https://github.com/vavkamil/awesome-vulnerable-apps), so you can get a flag if you abuse it properly. + +### Project structure + +- `checkpoints` contains .ckpt files of pretrained DeepSpeech models. Pretrained models can be found [here](releases). +- `training` includes a notebook with data preparation and fitting for the NB classifier and vectorizer. +- `pickles` folder is used to store them. + +### Installation + +Install deepspeech.pytorch: + +``` +git clone https://github.com/SeanNaren/deepspeech.pytorch +cd deepspeech.pytorch +pip install -r requirements.txt +pip install -e . +``` + +Clone this repository and run within it to install remaining dependencies: +``` +pip install -r requirements.txt +``` + +### Mitigations + +The robustness of the original [LibriSpeech model]() can be increased using adversarial retraining with Gaussian data augmentation. The example model can be found in [Releases](releases). 
def save_to_txt(dest_dir, filename, content):
    """Append ``content`` to the file ``filename`` inside ``dest_dir``.

    Args:
        dest_dir: Directory that holds the file. It is created (including
            parents) when missing, so a fresh checkout without the
            git-ignored ``timing`` folder does not fail here.
        filename: Name of the file inside ``dest_dir``.
        content: Text appended to the file.
    """
    # Local import: the module itself only imports ``path`` from ``os``.
    import os

    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)
    with open(path.join(dest_dir, filename), 'a') as f:
        f.write(content)


def time_to_file(
    file_maker: Callable,
    dest_dir: str = 'timing',
):
    """Build a decorator that reports the wall-clock time of each call.

    The decorated function must accept ``(model, filepath, labels)``
    positionally. After every call ``file_maker(dest_dir, name, message)`` is
    invoked, where ``name`` is the sample's base name up to its first ``.``
    and ``message`` states the elapsed seconds.

    Args:
        file_maker: Callable that persists the timing message.
        dest_dir: First argument forwarded to ``file_maker``.

    Returns:
        A decorator returning a wrapper with the wrapped function's result.
    """
    import functools

    def decorator(function):
        # Keep __name__/__doc__ of the wrapped function for debugging.
        @functools.wraps(function)
        def wrapper(model, filepath, transcription):
            start_time = time.time()
            result = function(model, filepath, transcription)
            elapsed = time.time() - start_time
            sample = path.split(filepath)[-1]
            file_maker(
                dest_dir,
                sample.split('.')[0],
                f'Time elapsed: {elapsed}s for {sample}\n'
            )
            return result
        return wrapper
    return decorator


@time_to_file(save_to_txt)
def make_adversarial(model, filepath, labels):
    """Generate an adversarial waveform for the audio stored at ``filepath``.

    Builds an ``ImperceptibleASRPyTorch`` attack around ``model``, loads the
    audio with ``load_np_audio`` and returns ``attack.generate(audio, labels)``.
    The call is timed through ``time_to_file``.
    """
    adversarial = ImperceptibleASRPyTorch(model)
    audio = load_np_audio(filepath)
    return adversarial.generate(audio, labels)


def create_args(sample_list, transcription):
    """Pair every sample with the transcription registered for its prefix.

    The prefix is the part of the sample name before the first ``_``. Samples
    whose prefix is missing from ``transcription`` are skipped together with
    their slot, so the remaining pairs stay aligned (previously a skipped
    sample shifted every following transcription onto the wrong sample).

    Args:
        sample_list: Iterable of sample file names.
        transcription: Mapping from prefix to target transcription.

    Returns:
        A ``zip`` iterator of ``(sample, transcription)`` tuples.
    """
    matched_samples = []
    matched_texts = []
    for sample in sample_list:
        rec = sample.split('_')[0]
        if rec in transcription:
            matched_samples.append(sample)
            matched_texts.append(transcription[rec])
    return zip(matched_samples, matched_texts)


def load_np_audio(filepath):
    """Return the first element of ``torchaudio.load(filepath)`` as a numpy array."""
    return torchaudio.load(filepath)[0].numpy()


def save_np_audio(array, filename, destdir, sample_rate=16000):
    """Save a numpy waveform as ``destdir/filename`` through ``torchaudio.save``.

    Args:
        array: Numpy array converted with ``torch.from_numpy``.
        filename: Name of the output file.
        destdir: Directory the file is written to.
        sample_rate: Sample rate passed to ``torchaudio.save``; defaults to
            the previously hard-coded 16000.
    """
    tensor = torch.from_numpy(array)
    filepath = path.join(destdir, filename)
    torchaudio.save(filepath, tensor, sample_rate)


def create_advs(
    model: 'asr.PyTorchDeepSpeech',
    source_dir: str,
    dest_dir: str,
    samples: List[str],
    transcriptions: dict
) -> List[np.ndarray]:
    """Create and store an adversarial example for every matched sample.

    Args:
        model: Estimator handed to ``make_adversarial``.
        source_dir: Directory containing the original samples.
        dest_dir: Directory the adversarial audio is saved to.
        samples: Sample file names.
        transcriptions: Mapping from sample prefix to target transcription
            (looked up by ``create_args``).

    Returns:
        The generated waveforms, in the order they were produced.
    """
    advs = []
    for sample, text in create_args(samples, transcriptions):
        adv = make_adversarial(
            model,
            path.join(source_dir, sample),
            np.array([text])
        )
        advs.append(adv)
        # Persist immediately so finished work survives a later failure.
        save_np_audio(adv, sample, dest_dir)
    return advs
class DataMaker():
    """Build noise-augmented training sets and validation subsets.

    The source folder is expected to contain ``wav`` and ``txt``
    sub-directories with matching file stems; the destination receives the
    same two sub-directories and a manifest is generated for it.
    """

    # Fallback population size for ``_val_random``.
    # NOTE(review): presumably the number of LibriSpeech training utterances
    # -- confirm.
    LIBRISPEECH_MAX = 281241

    def __init__(
        self,
        samples_folder: str,
        dest_path: str,
        manifest_path: str = 'manifests',
        num_workers: int = 1
    ):
        """Store the source/destination layout and a default noise preprocessor.

        Args:
            samples_folder: Folder holding the ``wav`` and ``txt`` inputs.
            dest_path: Folder the generated dataset is written to.
            manifest_path: Folder passed to ``create_manifest``.
            num_workers: Worker count passed to ``create_manifest``.
        """
        self.num_workers = num_workers
        self.manifest_path = Path(manifest_path)
        self.dest_path = Path(dest_path)
        self.dest_wav = self.dest_path / 'wav'
        self.dest_txt = self.dest_path / 'txt'
        self.samples_wav = Path(samples_folder) / 'wav'
        self.samples_txt = Path(samples_folder) / 'txt'
        self.prep = NoisePreprocessor()

    def _create_manifest(self, prefix):
        """Create the manifest ``<prefix>_noise.json`` for the destination folder."""
        output_name = f'{prefix}_noise.json'
        create_manifest(
            str(self.dest_path),
            output_name,
            self.manifest_path,
            self.num_workers
        )

    def _save_sample(self, name, text, audio):
        """Write ``audio`` to ``wav/<name>.wav`` and ``text`` to ``txt/<name>.txt``."""
        save_np_audio(audio, f'{name}.wav', str(self.dest_wav))
        (self.dest_txt / f'{name}.txt').write_text(text)

    def _get_text(self, name):
        """Return the transcription stored for the sample stem ``name``."""
        return (self.samples_txt / f'{name}.txt').read_text()

    def _apply_noise(self, sample, times):
        """Store ``times`` independently noised copies of ``sample``.

        Copies are named ``<stem>_<index>`` and share the original text.
        """
        audio = load_np_audio(sample)
        # ``rstrip('.wav')`` strips a *character set* ("nova.wav" -> "no");
        # the stem removes exactly the extension.
        name = Path(sample).stem
        text = self._get_text(name)
        for t in range(times):
            audio_noised = self.prep.apply_noise(audio)
            # Save the noised signal; the clean one was stored before, which
            # made the augmentation a no-op.
            self._save_sample(f'{name}_{t}', text, audio_noised)

    def _make_dirs(self):
        """Create the destination tree.

        Missing parents are created; an already existing destination still
        raises ``FileExistsError`` so earlier output is never mixed in.
        """
        self.dest_path.mkdir(parents=True)
        self.dest_wav.mkdir()
        self.dest_txt.mkdir()

    def apply_noise(self, prefix='train', times=3):
        """Create a noised dataset from every source sample plus its manifest.

        Args:
            prefix: Prefix of the manifest file name.
            times: Number of noised copies per sample.
        """
        self._make_dirs()
        for sample in self.samples_wav.iterdir():
            self._apply_noise(sample, times)
        self._create_manifest(prefix)

    def _val_random(self, size, population=None):
        """Return ``size`` distinct, sorted indexes below ``population``.

        ``population`` defaults to ``LIBRISPEECH_MAX`` (the former fixed
        behaviour).
        """
        if population is None:
            population = self.LIBRISPEECH_MAX
        gen = np.random.default_rng()
        return sorted(gen.choice(
            population,
            size=size,
            replace=False
        ))

    def _copy_sample(self, wav):
        """Copy ``wav`` and its transcription into the destination tree."""
        name = Path(wav).stem
        shutil.copy(str(wav), self.dest_wav)
        txt = self.samples_txt / f'{name}.txt'
        shutil.copy(str(txt), self.dest_txt)

    def _create_random(self, size):
        """Copy ``size`` randomly chosen samples (all of them if fewer exist).

        Indexes are drawn from the number of files actually present, so the
        requested amount is produced for folders of any size.
        """
        wavs = list(self.samples_wav.iterdir())
        if not wavs:
            return
        chosen = self._val_random(min(size, len(wavs)), population=len(wavs))
        for index in chosen:
            self._copy_sample(wavs[index])

    def _create_iterate(self, size):
        """Copy the first ``size`` samples in directory-iteration order."""
        for i, wav in enumerate(self.samples_wav.iterdir()):
            if i == size:
                break
            self._copy_sample(wav)

    def create_vals(self, size, random=False):
        """Create a validation subset of ``size`` samples plus its manifest.

        Args:
            size: Number of samples to copy.
            random: Pick samples at random instead of taking the first ones.
        """
        self._make_dirs()
        if random:
            self._create_random(size)
        else:
            self._create_iterate(size)
        self._create_manifest(prefix='val')
}, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Предсказание обычной записи оригинальной моделью:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(False, ['thankful', 'that', 'one', 'of', \"roy's\", 'sisters', 'was', 'liable'])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "antifraud.check_regular('transcription_prepared/normal/112-123215-0030.wav')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Предсказание мошеннической записи оригинальной моделью:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(True,\n", + " \"hello sir i'm from bank please tell me your can number for a bank purification\")" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ans = antifraud.check_regular('transcription_prepared/fraud/cvv_indian.wav')\n", + "ans[0], \" \".join(ans[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Предсказание мошеннической записи с состязательным шумом оригинальной моделью:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(False,\n", + " 'hello honey how are you i hope you have a nice weekend with your team mates')" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ans = antifraud.check_regular('transcription_prepared/adversarial/cvv_indian.wav')\n", + "ans[0], \" \".join(ans[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Предсказание мошеннической записи с состязательным шумом защищенной моделью:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "(True,\n", + " \"hello sir i'm from bank please tell me your can number for a bank purification\")" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ans = antifraud.check_adv_retrain('transcription_prepared/adversarial/cvv_indian.wav')\n", + "ans[0], \" \".join(ans[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Показать предсказания всех доступных моделей:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "regular model prediction: False\n", + "Transcription: hello honey how are you i hope you have a nice weekend with your team mates\n", + "gauss model prediction: False\n", + "Transcription: iron short am ally e tell me your same are number for avertigation\n", + "gauss_retrain model prediction: True\n", + "Transcription: hello sir i'm from bank please tell me your can number for a bank purification\n", + "adv_retrain model prediction: True\n", + "Transcription: hello sir i'm from bank please tell me your can number for a bank purification\n" + ] + } + ], + "source": [ + "for ans in antifraud.check_all('transcription_prepared/adversarial/cvv_indian.wav'):\n", + " print(f'{ans[0]} model prediction: {ans[1][0]}')\n", + " print(f'Transcription: {\" \".join(ans[1][1])}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff 
class ModelBuilder:
    """Load the pickled classifier/vectorizer and attach them to ASR models."""

    def __init__(
        self,
        pretrained_art='librispeech',
        classifier_path='pickles/classifier.pkl',
        vectorizer_path='pickles/vectorizer.pkl',
    ):
        """Unpickle the helper objects; exit with status 1 when that fails.

        Args:
            pretrained_art: ``pretrained_model`` name given to
                ``asr.PyTorchDeepSpeech`` by ``get_regular``.
            classifier_path: Path of the pickled classifier.
            vectorizer_path: Path of the pickled vectorizer.

        Raises:
            SystemExit: With code 1 if either pickle cannot be loaded.
        """
        self.pretrained = pretrained_art
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # point these paths at trusted artefacts.
        try:
            with open(classifier_path, 'rb') as fid:
                self.clfr = pickle.load(fid)
            with open(vectorizer_path, 'rb') as fid:
                self.vc = pickle.load(fid)
        except Exception as err:
            # ``Exception`` instead of a bare ``except`` so Ctrl-C is not
            # swallowed; ``exit`` is replaced by the equivalent SystemExit
            # because the ``exit`` helper only exists when ``site`` is loaded.
            print('Pickled objects corrupted')
            print(f'Reason: {err!r}')
            raise SystemExit(1) from err

    def _load_ckpt(self, checkpoint):
        """Wrap a DeepSpeech checkpoint in an ART ``PyTorchDeepSpeech`` estimator."""
        model = DeepSpeech.load_from_checkpoint(checkpoint)
        return asr.PyTorchDeepSpeech(model=model)

    def _load_art(self):
        """Create the ART estimator for the configured pretrained model."""
        return asr.PyTorchDeepSpeech(pretrained_model=self.pretrained)

    def _set_helpers(self, loaded):
        """Attach the vectorizer (``vc``) and classifier (``clfr``) to ``loaded``."""
        loaded.vc = self.vc
        loaded.clfr = self.clfr
        return loaded

    def get_regular(self):
        """Return the pretrained estimator tagged with type ``'regular'``."""
        regular = self._load_art()
        regular.type = 'regular'
        return self._set_helpers(regular)

    def get_retrain(self, type, checkpoint):
        """Return an estimator loaded from ``checkpoint`` tagged with ``type``."""
        retrain = self._load_ckpt(checkpoint)
        retrain.type = type
        return self._set_helpers(retrain)


class SimpleAntifraudPart:
    """One speech-to-text model combined with the fraud classifier."""

    # Lower-cased, tokenised transcription that yields the flag response.
    _FLAG_TOKENS = ['adversarial']

    def __init__(
        self,
        model,
        verbose=False,
        preprocessor=False,
    ):
        """Store the model and optionally enable Gaussian input noise.

        Args:
            model: Estimator carrying ``type``, ``vc`` and ``clfr`` attributes.
            verbose: Stored on the instance (see ``check_audio``).
            preprocessor: When true, audio is noised with
                ``NoisePreprocessor.MED_NOISE`` before prediction and the
                model's ``type`` is overwritten with ``'gauss'``.
        """
        self.model = model
        self.verbose = verbose
        if preprocessor:
            noise = NoisePreprocessor.MED_NOISE
            self.preprocessor = NoisePreprocessor(noise)
            self.model.type = 'gauss'
        else:
            self.preprocessor = None

    def _get_predictor(self, audio):
        """Predict on ``audio``, applying the noise preprocessor when configured."""
        if self.preprocessor:
            return self._predict(
                self.preprocessor.apply_noise(audio)
            )
        return self._predict(audio)

    def get_type(self):
        """Return the ``type`` tag of the wrapped model."""
        return self.model.type

    def _predict(self, audio):
        """Delegate to ``model.predict``."""
        return self.model.predict(audio)

    def predict(self, filepath):
        """Load the audio at ``filepath`` and return the model prediction."""
        audio = torchaudio.load(filepath)[0].numpy()
        return self._get_predictor(audio)

    def _transcribe(self, filepath):
        """Return the first prediction as a list of lower-cased words."""
        return self.predict(filepath)[0].lower().split()

    def _correct(self, text):
        """Spell-correct every word of ``text`` with TextBlob."""
        return [str(TextBlob(word).correct()) for word in text]

    def check_audio(self, filepath) -> tuple:
        """Classify the recording at ``filepath``.

        Returns:
            ``(is_fraud, words)``. A recording transcribed as the single word
            "adversarial" returns ``(False, ['You', 'get', 'a', 'flag'])``.

        NOTE(review): the original return expression parsed as
        ``(ans if not verbose else ans), words``, so a tuple was returned for
        every ``verbose`` value; that shape is kept for existing callers.
        """
        input_text = self._transcribe(filepath)
        # The transcription is a lower-cased token list, so it must be
        # compared with a list; comparing with the string 'ADVERSARIAL' could
        # never succeed.
        if input_text == self._FLAG_TOKENS:
            return False, 'You get a flag'.split()
        input_corrected = self._correct(input_text)
        if not input_corrected:
            # Nothing was recognised: no word can vote for fraud, and the
            # classifier is not asked to predict on zero samples.
            return False, input_corrected

        # Each word is classified on its own; any positive vote flags fraud.
        fraud_votes = sum(
            self.model.clfr.predict(
                self.model.vc.transform(
                    input_corrected
                )))
        return bool(fraud_votes > 0), input_corrected


class SimpleAntifraud():
    """Registry of ``SimpleAntifraudPart`` objects addressed by model type."""

    def __init__(self, parts):
        """Index ``parts`` by ``get_type()``; later duplicates replace earlier ones."""
        self.parts = dict()
        for part in parts:
            self.parts[part.get_type()] = part

    def _check_audio(self, filepath, part_type) -> tuple:
        """Run ``check_audio`` of the part registered as ``part_type``."""
        return self.parts[part_type].check_audio(filepath)

    def _check_partly(self, filepath, type):
        """Check ``filepath`` with the ``type`` part.

        Raises:
            NameError: If no part of that type was registered.
        """
        if type not in self.parts:
            raise NameError('Model not initializated')
        return self._check_audio(filepath, type)

    def check_gauss(self, filepath):
        """Check with the ``'gauss'`` part."""
        return self._check_partly(filepath, 'gauss')

    def check_gauss_retrain(self, filepath):
        """Check with the ``'gauss_retrain'`` part."""
        return self._check_partly(filepath, 'gauss_retrain')

    def check_adv_retrain(self, filepath):
        """Check with the ``'adv_retrain'`` part."""
        return self._check_partly(filepath, 'adv_retrain')

    def check_regular(self, filepath):
        """Check with the ``'regular'`` part."""
        return self._check_partly(filepath, 'regular')

    def check_all(self, filepath):
        """Return an iterator of ``(type, check_audio result)`` for every part."""
        types = self.parts.keys()
        return zip(types,
            [self._check_audio(filepath, t) for t in types]
        )
+ " special_characters = r'[£$&+,:;=?@#|<>.^*()%!-]'\n", + " return re.sub(special_characters, '', column)\n", + "\n", + "def remove_special_escaped(column):\n", + " escaped = ['<', '>', '&']\n", + " for i in escaped: column = column.replace(i, '')\n", + " return column\n", + "\n", + "def many_nums(nums : str):\n", + " return ' '.join([num2words(i) for i in nums])\n", + "\n", + "def less_nums(nums : str):\n", + " return num2words(nums)\n", + "\n", + "def cost_speaked(column):\n", + " digits_comma_separated = r\"((\\d*\\.?\\d+|\\d{1,3}(,\\d{3})*(\\.\\d+)?))\"\n", + " return re.sub(digits_comma_separated,\n", + " lambda x: less_nums(x.group()), column)\n", + "\n", + "def phones_speaked(column):\n", + " more_than_four_digits = r\"\\d{4,}\"\n", + " return re.sub(more_than_four_digits, \n", + " lambda x: many_nums(x.group()), column)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c5895287", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'TWO THOUSAND'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cost_speaked('2000').upper()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "4d8881ec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'todays vodafone numbers ending with 4882 are selected to a receive a 350 award if your number matches call 09064019014 to receive your 350 award'" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "remove_special_characters(data.content[1].lower())" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "43fc86ac", + "metadata": {}, + "outputs": [], + "source": [ + "# firstly I think to use .replace but it operates only full column content\n", + "data_speaked = data[['content']].applymap(lambda x: x.lower())\\\n", + " .applymap(remove_special_escaped)\\\n", + " .applymap(remove_special_characters)\\\n", + " 
.applymap(phones_speaked)\\\n", + " .applymap(cost_speaked)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "74af2bfa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'todays vodafone numbers ending with four eight eight two are selected to a receive a three hundred and fifty award if your number matches call zero nine zero six four zero one nine zero one four to receive your three hundred and fifty award'" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_speaked.content[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "82ac3bc2", + "metadata": {}, + "outputs": [], + "source": [ + "def lemmatize(column):\n", + " return ' '.join([lemmatizer.lemmatize(word) for word in column.split() \\\n", + " if word not in stopwords.words('english')])" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "249dc6fb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'today vodafone number ending four eight eight two selected receive three hundred fifty award number match call zero nine zero six four zero one nine zero one four receive three hundred fifty award'" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_lemmatized = data_speaked.applymap(lemmatize)\n", + "data_lemmatized.content[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c5996a41", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 hello bank manager sbi ur debit card expire wo...\n", + "1 today vodafone number ending four eight eight ...\n", + "2 please say like hi hi hi\n", + "3 thank\n", + "4 oh forwarded message thought send\n", + " ... 
\n", + "5919 get one zero zero zero inr voucher please call...\n", + "5920 get free access google cloud account hit given...\n", + "5921 get free aws cloud account hit given message b...\n", + "5922 get free access microsoft azure hit given mess...\n", + "5923 hello sir bank fill application form credit ca...\n", + "Name: content, Length: 5924, dtype: object" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_final = data_lemmatized.copy()\n", + "data_final['label'] = pd.get_dummies(data.label).fraud\n", + "data_final.to_csv('fraud_clear.csv') #remove junk lines (with decimal, etc) later \n", + "data_final.content" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "5f9660f2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9756920999324781" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "\n", + "x = vectorizer.fit_transform(data_final.content).toarray()\n", + "x_train, x_test, y_train, y_test = train_test_split(x, data_final.label, \n", + " test_size=.25, random_state=50)\n", + "fraud_classifier = MultinomialNB().fit(x_train, y_train)\n", + "fraud_classifier.score(x_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "ac90ff64", + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "with open('classifier.pkl', 'wb') as fid:\n", + " pickle.dump(fraud_classifier, fid)\n", + "with open('vectorizer.pkl', 'wb') as fid:\n", + " pickle.dump(vectorizer, fid)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", 
+ "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transcription_prepared/adversarial/amazon_elena.wav b/transcription_prepared/adversarial/amazon_elena.wav new file mode 100644 index 0000000..be2095b Binary files /dev/null and b/transcription_prepared/adversarial/amazon_elena.wav differ diff --git a/transcription_prepared/adversarial/cloud_speaker2.wav b/transcription_prepared/adversarial/cloud_speaker2.wav new file mode 100644 index 0000000..e5bcb0f Binary files /dev/null and b/transcription_prepared/adversarial/cloud_speaker2.wav differ diff --git a/transcription_prepared/adversarial/cvv_indian.wav b/transcription_prepared/adversarial/cvv_indian.wav new file mode 100644 index 0000000..bec0a0c Binary files /dev/null and b/transcription_prepared/adversarial/cvv_indian.wav differ diff --git a/transcription_prepared/adversarial/mumbai_speaker2.wav b/transcription_prepared/adversarial/mumbai_speaker2.wav new file mode 100644 index 0000000..d6d23ea Binary files /dev/null and b/transcription_prepared/adversarial/mumbai_speaker2.wav differ diff --git a/transcription_prepared/adversarial/urgent_elena.wav b/transcription_prepared/adversarial/urgent_elena.wav new file mode 100644 index 0000000..a63ca6c Binary files /dev/null and b/transcription_prepared/adversarial/urgent_elena.wav differ diff --git a/transcription_prepared/fraud/amazon_elena.wav b/transcription_prepared/fraud/amazon_elena.wav new file mode 100644 index 0000000..90e2bf2 Binary files /dev/null and b/transcription_prepared/fraud/amazon_elena.wav differ diff --git a/transcription_prepared/fraud/amazon_speaker1.wav b/transcription_prepared/fraud/amazon_speaker1.wav new file mode 100644 index 0000000..74be2d9 Binary files /dev/null and b/transcription_prepared/fraud/amazon_speaker1.wav differ diff --git a/transcription_prepared/fraud/amazon_speaker3.wav 
b/transcription_prepared/fraud/amazon_speaker3.wav new file mode 100644 index 0000000..76a576d Binary files /dev/null and b/transcription_prepared/fraud/amazon_speaker3.wav differ diff --git a/transcription_prepared/fraud/cloud_speaker2.wav b/transcription_prepared/fraud/cloud_speaker2.wav new file mode 100644 index 0000000..4213a30 Binary files /dev/null and b/transcription_prepared/fraud/cloud_speaker2.wav differ diff --git a/transcription_prepared/fraud/cvv_elena.wav b/transcription_prepared/fraud/cvv_elena.wav new file mode 100644 index 0000000..c367b06 Binary files /dev/null and b/transcription_prepared/fraud/cvv_elena.wav differ diff --git a/transcription_prepared/fraud/cvv_indian.wav b/transcription_prepared/fraud/cvv_indian.wav new file mode 100644 index 0000000..192b924 Binary files /dev/null and b/transcription_prepared/fraud/cvv_indian.wav differ diff --git a/transcription_prepared/fraud/mumbai_speaker2.wav b/transcription_prepared/fraud/mumbai_speaker2.wav new file mode 100644 index 0000000..0c3b08b Binary files /dev/null and b/transcription_prepared/fraud/mumbai_speaker2.wav differ diff --git a/transcription_prepared/fraud/urgent_elena.wav b/transcription_prepared/fraud/urgent_elena.wav new file mode 100644 index 0000000..4c0cdbd Binary files /dev/null and b/transcription_prepared/fraud/urgent_elena.wav differ diff --git a/transcription_prepared/normal/1034-121119-0039.wav b/transcription_prepared/normal/1034-121119-0039.wav new file mode 100644 index 0000000..7d07192 Binary files /dev/null and b/transcription_prepared/normal/1034-121119-0039.wav differ diff --git a/transcription_prepared/normal/1092-134579-0025.wav b/transcription_prepared/normal/1092-134579-0025.wav new file mode 100644 index 0000000..c89ef40 Binary files /dev/null and b/transcription_prepared/normal/1092-134579-0025.wav differ diff --git a/transcription_prepared/normal/112-123215-0030.wav b/transcription_prepared/normal/112-123215-0030.wav new file mode 100644 index 0000000..3000357 
Binary files /dev/null and b/transcription_prepared/normal/112-123215-0030.wav differ diff --git a/transcription_prepared/normal/1296-138074-0059.wav b/transcription_prepared/normal/1296-138074-0059.wav new file mode 100644 index 0000000..0a8a76d Binary files /dev/null and b/transcription_prepared/normal/1296-138074-0059.wav differ diff --git a/transcription_prepared/normal/1425-139297-0018.wav b/transcription_prepared/normal/1425-139297-0018.wav new file mode 100644 index 0000000..8aac86c Binary files /dev/null and b/transcription_prepared/normal/1425-139297-0018.wav differ diff --git a/transcription_prepared/normal/1633-141584-0018.wav b/transcription_prepared/normal/1633-141584-0018.wav new file mode 100644 index 0000000..f92e2a0 Binary files /dev/null and b/transcription_prepared/normal/1633-141584-0018.wav differ