-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #183 from eye-on-surveillance/main
release: focus group 2
- Loading branch information
Showing
28 changed files
with
1,796 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
50 changes: 50 additions & 0 deletions
50
packages/googlecloud/functions/getanswer/process_public_queries.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import pandas as pd | ||
import numpy as np | ||
import requests | ||
import csv | ||
import json | ||
from tqdm import tqdm | ||
|
||
# Source spreadsheet of queries (read through its 'title' column) and the
# destination file that collects the generated answers.
input_csv = "/Users/haydenoutlaw/Desktop/card_rows_export_2023-11-29.csv"
output_csv = "/Users/haydenoutlaw/Desktop/gpt4-varied-11-29.csv"

# Base URL of the getanswer server that receives the requests.
api_endpoint = "http://localhost:8080"

# Values of k that are sent, one request each, for every query.
k_list = [5, 10, 15]
|
||
# get response from local getanswer server, store answers | ||
def make_api_call(title, k_inp):
    """Query the getanswer server for one title and return its answer fields.

    Args:
        title: Query text, sent as the ``query`` field of the JSON payload.
        k_inp: Value sent as the ``k`` field of the JSON payload.

    Returns:
        tuple: ``(card_type, citations, responses, k_inp)``; the first three
        items are read from the JSON body of the server's response.

    Raises:
        requests.HTTPError: If the server replies with a 4xx/5xx status.
        KeyError: If the response body lacks one of the expected keys.
    """
    payload = {"query": title, "response_type": "in_depth", "card_id": 1, "k": k_inp}
    response = requests.post(api_endpoint, json=payload)
    # Surface server-side failures as an HTTP error instead of letting an
    # error page fall through to a confusing JSON decode error or KeyError.
    response.raise_for_status()
    rdict = json.loads(response.text)
    return rdict["card_type"], rdict["citations"], rdict["responses"], k_inp
|
||
# Open the output CSV in append mode so results of earlier runs are preserved
with open(output_csv, 'a', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Append mode starts at end-of-file, so a non-zero position means the file
    # already has content (and a header) from a previous run. Writing the
    # header again would leave a stray header row in the middle of the data.
    if csv_file.tell() == 0:
        csv_writer.writerow(["query", "response_id", "card_type", "citations", "responses", "k"])

    # read inputs
    df = pd.read_csv(input_csv)

    print("Connected to getanswer at", api_endpoint)
    print("K Values", k_list)
    print("Generating Responses....")

    # For every query and every k, fetch the answer and write the row
    # immediately so partial progress survives an interrupted run.
    for i, query in enumerate(tqdm(df["title"])):
        for k_val in k_list:
            card_type, citations, responses, k = make_api_call(query, k_val)
            csv_writer.writerow([query, i, card_type, citations, responses, k])

print(f"Results saved to '{output_csv}'.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
.env | ||
.log | ||
__pycache__/ | ||
transcripts-data/ | ||
audio/ | ||
cred/ | ||
.vscode/ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
## TU Capstone - Transcription | ||
|
||
A generic API for fetching YouTube Audio and Transcripts. | ||
|
||
#### Required Credentials | ||
- YOUTUBE_API_KEY | ||
- GOOGLE_APPLICATION_CREDENTIALS | ||
Create a `cred` folder containing a `cred.env` file that defines these variables in dotenv format. | ||
|
||
### transcripts.py | ||
Retrieves & downloads the x-most recent video transcripts from a YouTube Channel. | ||
|
||
### monitor.py | ||
Retrieves & downloads the audio (mp4) of the x most recent videos from a YouTube channel. Future implementations should consider using Windows Task Scheduler to periodically monitor the channel for new videos. | ||
|
||
### oauth.py | ||
Helper authentication function. | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
from googleapiclient.discovery import build | ||
#import youtube_dl Has BEEN DEPRECATED BY GERMAN GOVERNMENT | ||
import os | ||
from dotenv import load_dotenv | ||
from pytube import YouTube | ||
import oauth | ||
# Initialize the YouTube Data API client | ||
|
||
# Load the credentials and build the YouTube Data API client from the API key
env_vars = oauth.import_env_vars()
YOUTUBE_API_KEY = env_vars.get('YOUTUBE_API_KEY')
youtube = build(serviceName='youtube', version='v3', developerKey=YOUTUBE_API_KEY)

# YouTube channel to monitor: New Orleans City Council
channel_id = 'UC8oPEsQe9a0v6TdJ4K_QXoA'
|
||
def get_latest_videos(channel_id, max_results=5):
    """
    Fetches the latest x-number of videos from a YouTube channel.
    Args:
        channel_id (str): The ID of the YouTube channel to monitor.
        max_results (int): The maximum number of latest videos to fetch. Default is 5.
    Returns:
        list: A list of video IDs for the latest videos (at most max_results).
    Raises:
        ValueError: If the API returns no channel for channel_id.
    """
    # The API returns at most this many playlist items per request
    page_size_limit = 50

    # Fetch channel details to get the ID of the uploads playlist
    channel_response = youtube.channels().list(
        part='contentDetails',
        id=channel_id
    ).execute()

    channels = channel_response.get('items')
    if not channels:
        raise ValueError(f"No channel found with ID {channel_id}")

    playlist_id = channels[0]['contentDetails']['relatedPlaylists']['uploads']

    # Walk the uploads playlist page by page so that requests for more than
    # one page worth of videos are honoured instead of failing or truncating.
    video_ids = []
    page_token = None
    while len(video_ids) < max_results:
        list_args = {
            'part': 'snippet',
            'playlistId': playlist_id,
            'maxResults': min(max_results - len(video_ids), page_size_limit),
        }
        if page_token:
            list_args['pageToken'] = page_token
        page = youtube.playlistItems().list(**list_args).execute()

        # A playlist without uploads may omit 'items'; treat that as empty
        video_ids.extend(
            item['snippet']['resourceId']['videoId'] for item in page.get('items', [])
        )

        page_token = page.get('nextPageToken')
        if not page_token:
            break

    return video_ids[:max_results]
|
||
def download_audio(video_ids):
    """
    Downloads the audio of a list of YouTube videos using pytube.
    Args:
        video_ids (list): A list of YouTube video IDs to download the audio for.
    Downloads: one '<video_id>.mp4' audio file per video into the
        'audio' sub-directory of 'transcripts-data' (relative to the
        current working directory). Videos that expose no audio-only
        stream are reported and skipped.
    """
    # Build the path with os.path.join: a hard-coded backslash is only a
    # directory separator on Windows and would otherwise become part of a
    # single oddly named folder on other platforms.
    output_dir = os.path.join('transcripts-data', 'audio')

    for video_id in video_ids:
        yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
        audio_stream = yt.streams.filter(only_audio=True).first()

        # first() yields None when the filter matches nothing; skip the video
        # rather than crash with AttributeError and abandon the remaining ones.
        if audio_stream is None:
            print(f'No audio-only stream available for {video_id}, skipping.')
            continue

        # Download the audio stream to the specified output path
        print(f'Downloading audio for {video_id}...')
        audio_stream.download(output_path=output_dir, filename=video_id + ".mp4")
|
||
# Look up the latest ten videos of the channel, then download the audio of each
video_ids = get_latest_videos(channel_id, max_results=10)
download_audio(video_ids)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
|
||
import os | ||
from dotenv import load_dotenv | ||
|
||
def import_env_vars():
    """Load the API credentials from the dotenv file and return them.

    Side effects:
        Changes the process working directory to ``packages/transcription``
        (resolved against the current directory), loads ``cred/cred.env``
        from there, and writes ``GOOGLE_APPLICATION_CREDENTIALS`` back into
        ``os.environ`` when a value is available.

    Returns:
        dict: The keys ``YOUTUBE_API_KEY``, ``CLIENT_ID``, ``CLIENT_SECRET``
        and ``GOOGLE_APPLICATION_CREDENTIALS``; a value is ``None`` when the
        corresponding environment variable is not defined.
    """
    # os.path.join keeps both paths valid on POSIX as well as on Windows;
    # backslash literals only act as directory separators on Windows.
    os.chdir(os.path.join("packages", "transcription"))
    load_dotenv(os.path.join("cred", "cred.env"))

    # Get credentials from environment variables
    credential_names = (
        "YOUTUBE_API_KEY",
        "CLIENT_ID",
        "CLIENT_SECRET",
        "GOOGLE_APPLICATION_CREDENTIALS",
    )
    credentials = {name: os.getenv(name) for name in credential_names}

    # os.environ accepts only strings, so assigning a missing (None) value
    # would raise TypeError; export the variable only when it is present.
    google_credentials = credentials["GOOGLE_APPLICATION_CREDENTIALS"]
    if google_credentials is not None:
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_credentials

    return credentials
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
from youtube_transcript_api import YouTubeTranscriptApi | ||
from googleapiclient.discovery import build | ||
import oauth | ||
import json | ||
import os | ||
|
||
# Load the credentials once and expose each of them as a module-level constant
env_vars = oauth.import_env_vars()
YOUTUBE_API_KEY, CLIENT_ID, CLIENT_SECRET, GOOGLE_APPLICATION_CREDENTIALS = (
    env_vars.get(name)
    for name in (
        "YOUTUBE_API_KEY",
        "CLIENT_ID",
        "CLIENT_SECRET",
        "GOOGLE_APPLICATION_CREDENTIALS",
    )
)
|
||
def get_latest_videos(channel_id, max_results=5):
    """
    Fetches the latest x-number of videos from a YouTube channel.
    Args:
        channel_id (str): The ID of the YouTube channel to monitor.
        max_results (int): The maximum number of latest videos to fetch. Default is 5.
    Returns:
        list: A list of video IDs for the latest videos (at most max_results).
    Raises:
        ValueError: If the API returns no channel for channel_id.
    """
    # The API returns at most this many playlist items per request
    page_size_limit = 50

    youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

    # Fetch channel details to get the ID of the uploads playlist
    channel_response = youtube.channels().list(
        part='contentDetails',
        id=channel_id
    ).execute()

    channels = channel_response.get('items')
    if not channels:
        raise ValueError(f"No channel found with ID {channel_id}")

    playlist_id = channels[0]['contentDetails']['relatedPlaylists']['uploads']

    # Walk the uploads playlist page by page so that requests for more than
    # one page worth of videos are honoured instead of failing or truncating.
    video_ids = []
    page_token = None
    while len(video_ids) < max_results:
        list_args = {
            'part': 'snippet',
            'playlistId': playlist_id,
            'maxResults': min(max_results - len(video_ids), page_size_limit),
        }
        if page_token:
            list_args['pageToken'] = page_token
        page = youtube.playlistItems().list(**list_args).execute()

        # A playlist without uploads may omit 'items'; treat that as empty
        video_ids.extend(
            item['snippet']['resourceId']['videoId'] for item in page.get('items', [])
        )

        page_token = page.get('nextPageToken')
        if not page_token:
            break

    return video_ids[:max_results]
|
||
def download_transcripts(video_ids):
    """
    Fetches the transcript of each video and saves it as a JSON file.
    Args:
        video_ids (list): A list of YouTube video IDs to fetch transcripts for.
    Downloads: one '<video_id>_transcript.json' file per video into the
        'YT_transcripts' sub-directory of 'transcripts-data' (relative to
        the current working directory). A failure for one video is printed
        and does not stop the remaining videos from being processed.
    """
    # Build the path with os.path.join: a hard-coded backslash is only a
    # directory separator on Windows.
    output_dir = os.path.join('transcripts-data', 'YT_transcripts')

    for video_id in video_ids:
        try:
            # Grabs transcript for the video
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            print(transcript)

            # Create the target folder on demand; without it every save
            # fails on a fresh checkout with FileNotFoundError.
            os.makedirs(output_dir, exist_ok=True)
            transcript_path = os.path.join(output_dir, f'{video_id}_transcript.json')
            with open(transcript_path, 'w', encoding='utf-8') as file:
                json.dump(transcript, file)

            print(f'Transcript for {video_id} saved successfully.')

        except Exception as e:
            # Deliberately best-effort: report the problem and move on
            print(f'An error occurred while fetching the transcript for {video_id}: {e}')
|
||
# Fetch the latest ten videos of the channel and save a transcript for each
channel_id = "UC8oPEsQe9a0v6TdJ4K_QXoA"
video_ids = get_latest_videos(channel_id, max_results=10)
download_transcripts(video_ids)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# HF Whisper Transcript App | ||
Application of [OpenAI Whisper-V2](https://huggingface.co/openai/whisper-large-v2) for audio file transcription. | ||
|
||
|
||
## To Run | ||
Configure [transcribe_config.yml](transcribe_config.yml): | ||
```yml | ||
model: | ||
#model size | ||
#tiny, base, small, medium, large, large_v2 | ||
size: "tiny" | ||
# device for pytorch processing | ||
device: "cpu" | ||
# chunk length for audio processing | ||
chunk_length: "10" | ||
# batch size | ||
batch_size: 1 | ||
audio: | ||
# path to audio file to process | ||
path: "audio/trial_meeting.mp3" | ||
transcript: | ||
# location to save transcript | ||
save_loc: "transcripts/trial_meeting_transcript.txt" | ||
``` | ||
Execute from CL: | ||
```bash | ||
python transcribe.py transcribe_config.yml | ||
``` |
Oops, something went wrong.
565d006
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Successfully deployed to the following URLs:
sawt – ./
sawt-git-prod-eye-on-surveillance-team.vercel.app
www.sawt.us
sawt.eyeonsurveillance.org
sawt-eye-on-surveillance-team.vercel.app
sawt.us