Skip to content

Commit

Permalink
Merge pull request #183 from eye-on-surveillance/main
Browse files Browse the repository at this point in the history
release: focus group 2
  • Loading branch information
marvinmarnold authored Dec 7, 2023
2 parents 36e9b65 + ccf57b2 commit 565d006
Show file tree
Hide file tree
Showing 28 changed files with 1,796 additions and 9 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

Work in Progress

## Tulane Dev Integration Branch

Sawt is a tool designed to bridge the communication gap between New Orleanians and their city council representatives.

## Prerequisites
Expand Down
3 changes: 3 additions & 0 deletions packages/googlecloud/functions/getanswer/inquirer.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,9 @@ def get_indepth_response_from_query(df, db, query, k):
query = transform_query_for_date(query)

doc_list = db.similarity_search_with_score(query, k=k)

docs = sort_retrived_documents(doc_list)

docs_page_content = append_metadata_to_content(doc_list)

template = """
Expand Down Expand Up @@ -245,3 +247,4 @@ def answer_query(
final_response = route_question(df, db_general, db_in_depth, query, response_type)

return final_response

9 changes: 8 additions & 1 deletion packages/googlecloud/functions/getanswer/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import google.cloud.logging
import functions_framework
from supabase import create_client

from dotenv import find_dotenv, load_dotenv
from helper import parse_field, get_dbs
from inquirer import answer_query
import os
Expand All @@ -19,6 +19,9 @@
db_general, db_in_depth, voting_roll_df = get_dbs()

# Setup Supabase client
load_dotenv(find_dotenv())


try:
supabase_url = os.environ["SUPABASE_URL_PRODUCTION"]
supabase_key = os.environ["SUPABASE_SERVICE_KEY_PRODUCTION"]
Expand Down Expand Up @@ -115,8 +118,12 @@ def getanswer(request):

end = time.time()
elapsed = int((end - start) * 1000)

update_supabase(responses_data, citations_data, card_id, elapsed)
logging.info(f"Completed getanswer in {elapsed} seconds")
print(f"\n\t--------- Completed getanswer in {elapsed} seconds --------\n")

return ("Answer successfully submitted to Supabase", 200, headers)



50 changes: 50 additions & 0 deletions packages/googlecloud/functions/getanswer/process_public_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pandas as pd
import numpy as np
import requests
import csv
import json
from tqdm import tqdm

# Script configuration. The two CSV paths below are absolute paths on one
# developer's machine; edit them before running this script elsewhere.

# Input CSV file with 'title' column
input_csv = "/Users/haydenoutlaw/Desktop/card_rows_export_2023-11-29.csv"
# Output CSV; it is opened in append mode further down, so rows accumulate across runs
output_csv = "/Users/haydenoutlaw/Desktop/gpt4-varied-11-29.csv"

# point to getanswer server
api_endpoint = "http://localhost:8080"

# list of k values; every query is sent once per value as the "k" payload field
k_list = [5, 10, 15]

def make_api_call(title, k_inp):
    """Send one query to the getanswer server and return its answer fields.

    Args:
        title: Query text, sent as the ``query`` field of the JSON payload.
        k_inp: Value sent as the ``k`` field of the JSON payload; it is also
            echoed back as the last element of the result.

    Returns:
        Tuple ``(card_type, citations, responses, k_inp)`` where the first
        three items are read from the JSON body of the response.

    Raises:
        requests.HTTPError: If the server replies with a 4xx/5xx status.
        KeyError: If the JSON body lacks one of the expected keys.
    """
    payload = {"query": title, "response_type": "in_depth", "card_id": 1, "k": k_inp}
    response = requests.post(api_endpoint, json=payload)
    # Fail with a clear HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    rdict = json.loads(response.text)
    return rdict["card_type"], rdict["citations"], rdict["responses"], k_inp

# Open the output CSV in append mode so results from earlier runs are kept.
with open(output_csv, 'a', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)

    # In append mode the stream starts at the end of the file, so position 0
    # means the file is new/empty. Only then write the header; otherwise a
    # re-run would inject a second header row in the middle of the data.
    if csv_file.tell() == 0:
        csv_writer.writerow(["query", "response_id", "card_type", "citations", "responses", "k"])

    # read inputs
    df = pd.read_csv(input_csv)

    print("Connected to getanswer at", api_endpoint)
    print("K Values", k_list)
    print("Generating Responses....")

    # For every query and every k value, fetch the answer and write the row
    # immediately so partial progress is on disk if a later call fails.
    for i, query in enumerate(tqdm(df["title"])):
        for k_val in k_list:
            card_type, citations, responses, k = make_api_call(query, k_val)
            csv_writer.writerow([query, i, card_type, citations, responses, k])

print(f"Results saved to '{output_csv}'.")
8 changes: 8 additions & 0 deletions packages/transcription/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.env
.log
__pycache__/
transcripts-data/
audio/
cred/
.vscode/

19 changes: 19 additions & 0 deletions packages/transcription/transcribe/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
## TU Capstone - Transcription

A generic API for fetching YouTube Audio and Transcripts.

#### Required Credentials
- YOUTUBE_API_KEY
- GOOGLE_APPLICATION_CREDENTIALS

Create a `cred` folder containing a `cred.env` file that defines these variables in dotenv format.

### transcripts.py
Retrieves & downloads the x-most recent video transcripts from a YouTube Channel.

### monitor.py
Retrieves & downloads the audio (mp4) of the x-most recent videos from a YouTube Channel. Future implementations should consider using Windows Task Scheduler to periodically monitor the channel for new videos.

### oauth.py
Helper authentication function.


71 changes: 71 additions & 0 deletions packages/transcription/transcribe/monitor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from googleapiclient.discovery import build
#import youtube_dl Has BEEN DEPRECATED BY GERMAN GOVERNMENT
import os
from dotenv import load_dotenv
from pytube import YouTube
import oauth
# Initialize the YouTube Data API client

# import_env_vars() loads cred.env and also changes the working directory
# (see oauth.py), so relative paths used later resolve from that directory.
env_vars = oauth.import_env_vars()
YOUTUBE_API_KEY = env_vars.get('YOUTUBE_API_KEY')
# Module-level client shared by get_latest_videos() below.
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

# Specify the YouTube channel ID
channel_id = 'UC8oPEsQe9a0v6TdJ4K_QXoA' # New Orleans City Council

def get_latest_videos(channel_id, max_results=5):
    """
    Return video IDs taken from a channel's uploads playlist.

    Uses the module-level ``youtube`` client: first looks up the channel's
    uploads playlist, then lists up to ``max_results`` items from it.

    Args:
        channel_id (str): The ID of the YouTube channel to monitor.
        max_results (int): Value passed as ``maxResults`` to the playlist
            listing. Default is 5.
    Returns:
        list: Video IDs, in the order the API returned them.
    Raises:
        ValueError: If the channel lookup returns no items.
    """
    # Step 1: resolve the channel to its "uploads" playlist.
    channel_lookup = youtube.channels().list(
        part='contentDetails',
        id=channel_id
    ).execute()

    channel_items = channel_lookup.get('items')
    if not channel_items:
        raise ValueError(f"No channel found with ID {channel_id}")

    uploads_playlist = channel_items[0]['contentDetails']['relatedPlaylists']['uploads']

    # Step 2: list the playlist entries and pull out each video ID.
    playlist_page = youtube.playlistItems().list(
        part='snippet',
        playlistId=uploads_playlist,
        maxResults=max_results
    ).execute()

    found_ids = []
    for entry in playlist_page['items']:
        found_ids.append(entry['snippet']['resourceId']['videoId'])
    return found_ids

def download_audio(video_ids):
    """
    Downloads the audio of a list of YouTube videos using pytube.

    For each ID the first audio-only stream is saved as ``<video_id>.mp4``
    under ``transcripts-data/audio`` (relative to the current working
    directory). Videos with no audio-only stream are skipped with a message.

    Args:
        video_ids (list): A list of YouTube video IDs to download the audio for.
    Returns:
        None
    """
    # os.path.join yields a real nested directory on every OS; the previous
    # hard-coded backslash path only meant "audio inside transcripts-data"
    # on Windows.
    output_dir = os.path.join('transcripts-data', 'audio')

    for video_id in video_ids:
        yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
        ys = yt.streams.filter(only_audio=True).first()

        # first() gives None when the filter matched no stream.
        if ys is None:
            print(f'No audio stream found for {video_id}, skipping.')
            continue

        # Download the audio stream to the specified output path
        print(f'Downloading audio for {video_id}...')
        ys.download(output_path=output_dir, filename=video_id + ".mp4")

# NOTE: these statements run at import time (there is no __main__ guard),
# so importing this module triggers the API calls and downloads.

# Get the latest videos (max_results=10 overrides the default of 5)
video_ids = get_latest_videos(channel_id, 10)

# Download the audio of the new videos
download_audio(video_ids)
21 changes: 21 additions & 0 deletions packages/transcription/transcribe/oauth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@

import os
from dotenv import load_dotenv

def import_env_vars():
    """Load credentials from ``cred/cred.env`` and return them as a dict.

    Side effects:
        * Changes the process working directory to ``packages/transcription``,
          resolved relative to the current working directory. Because the
          path is relative, this is expected to be called once, from the
          repository root.
        * Loads ``cred/cred.env`` via dotenv.
        * Re-exports ``GOOGLE_APPLICATION_CREDENTIALS`` into ``os.environ``
          when it has a value.

    Returns:
        dict: Keys ``YOUTUBE_API_KEY``, ``CLIENT_ID``, ``CLIENT_SECRET`` and
        ``GOOGLE_APPLICATION_CREDENTIALS``; a value is ``None`` when the
        variable is not set.
    """
    # Build paths with os.path.join so they work on POSIX as well as Windows
    # (a literal backslash is not a path separator outside Windows).
    os.chdir(os.path.join("packages", "transcription"))
    load_dotenv(os.path.join("cred", "cred.env"))

    # Get credentials from environment variables
    env_vars = {
        name: os.getenv(name)
        for name in (
            "YOUTUBE_API_KEY",
            "CLIENT_ID",
            "CLIENT_SECRET",
            "GOOGLE_APPLICATION_CREDENTIALS",
        )
    }

    # os.environ only accepts strings; assigning None would raise TypeError,
    # so skip the export when the variable is missing.
    google_credentials = env_vars["GOOGLE_APPLICATION_CREDENTIALS"]
    if google_credentials is not None:
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_credentials

    return env_vars
67 changes: 67 additions & 0 deletions packages/transcription/transcribe/transcripts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
import oauth
import json
import os

# Get credentials from environment variables
# import_env_vars() also changes the working directory (see oauth.py), so the
# relative output path used by download_transcripts() resolves from there.
env_vars = oauth.import_env_vars()
YOUTUBE_API_KEY = env_vars.get("YOUTUBE_API_KEY")
# NOTE(review): the three values below are not referenced anywhere in the
# visible part of this file - confirm whether they are still needed.
CLIENT_ID = env_vars.get("CLIENT_ID")
CLIENT_SECRET = env_vars.get("CLIENT_SECRET")
GOOGLE_APPLICATION_CREDENTIALS= env_vars.get("GOOGLE_APPLICATION_CREDENTIALS")

def get_latest_videos(channel_id, max_results=5):
    """
    Return video IDs taken from a channel's uploads playlist.

    Builds a YouTube Data API v3 client with ``YOUTUBE_API_KEY``, looks up
    the channel's uploads playlist, then lists up to ``max_results`` items
    from it.

    Args:
        channel_id (str): The ID of the YouTube channel to monitor.
        max_results (int): Value passed as ``maxResults`` to the playlist
            listing. Default is 5.
    Returns:
        list: Video IDs, in the order the API returned them.
    Raises:
        ValueError: If the channel lookup returns no items.
    """
    youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

    # Step 1: resolve the channel to its "uploads" playlist.
    channel_lookup = youtube.channels().list(
        part='contentDetails',
        id=channel_id
    ).execute()

    channel_items = channel_lookup.get('items')
    if not channel_items:
        raise ValueError(f"No channel found with ID {channel_id}")

    uploads_playlist = channel_items[0]['contentDetails']['relatedPlaylists']['uploads']

    # Step 2: list the playlist entries and pull out each video ID.
    playlist_page = youtube.playlistItems().list(
        part='snippet',
        playlistId=uploads_playlist,
        maxResults=max_results
    ).execute()

    found_ids = []
    for entry in playlist_page['items']:
        found_ids.append(entry['snippet']['resourceId']['videoId'])
    return found_ids

def download_transcripts(video_ids):
    """Fetch the transcript of each video and save it as a JSON file.

    Each transcript is written to
    ``transcripts-data/YT_transcripts/<video_id>_transcript.json`` relative
    to the current working directory; the directory is created if missing.
    A failure for one video is printed and does not stop the remaining ones.

    Args:
        video_ids (list): YouTube video IDs to fetch transcripts for.
    Returns:
        None
    """
    # os.path.join keeps the path valid on POSIX as well as Windows.
    output_dir = os.path.join('transcripts-data', 'YT_transcripts')

    for video_id in video_ids:
        try:
            # Grabs transcript for the video
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            print(transcript)

            # Create the target directory on demand so a missing folder is
            # not misreported below as a transcript fetch failure.
            os.makedirs(output_dir, exist_ok=True)
            output_path = os.path.join(output_dir, f'{video_id}_transcript.json')
            with open(output_path, 'w+', encoding='utf-8') as file:
                json.dump(transcript, file)

            print(f'Transcript for {video_id} saved successfully.')

        except Exception as e:
            # Deliberate best-effort: report and move on to the next video.
            print(f'An error occurred while fetching the transcript for {video_id}: {e}')

# NOTE: these statements run at import time (there is no __main__ guard).
# Same channel ID that monitor.py labels "New Orleans City Council".
channel_id = "UC8oPEsQe9a0v6TdJ4K_QXoA"
# max_results=10 overrides the default of 5.
video_ids = get_latest_videos(channel_id, 10)
download_transcripts(video_ids)
28 changes: 28 additions & 0 deletions packages/transcription/whisper-model/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# HF Whisper Transcript App
Application of [OpenAI Whisper-V2](https://huggingface.co/openai/whisper-large-v2) for audio file transcription.


## To Run
Configure `transcribe_config.yml`:
```yml
model:
#model size
#tiny, base, small, medium, large, large_v2
size: "tiny"
# device for pytorch processing
device: "cpu"
# chunk length for audio processing
chunk_length: "10"
# batch size
batch_size: 1
audio:
# path to audio file to process
path: "audio/trial_meeting.mp3"
transcript:
# location to save transcript
save_loc: "transcripts/trial_meeting_transcript.txt"
```
Execute from the command line:
```bash
python transcribe.py transcribe_config.yml
```
Loading

1 comment on commit 565d006

@vercel
Copy link

@vercel vercel bot commented on 565d006 Dec 7, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.