-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathyoutube_profanity_bot.py
119 lines (107 loc) · 5.65 KB
/
youtube_profanity_bot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from textblob import TextBlob
from better_profanity import profanity
import os
from time import sleep
PATH = 'C:\Program Files (x86)\chromedriver.exe' #provide path to chromedriver.exe here.
class ProfanityBot:
def __init__(self, channel_url, vid_cnt, del_transcript= True):
self.driver = webdriver.Chrome(PATH)
self.channel_url = channel_url
self.channel_name = ''
self.vid_cnt = vid_cnt #the number of videos to be fetched from channel.
self.del_transcript = del_transcript
def fetch_transcripts(self):
self.driver.get(self.channel_url+"/videos") #to jump on videos section.
self.channel_name = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[@id='inner-header-container']//div[@id='text-container']"))).text
links = WebDriverWait(self.driver, 10).until(EC.presence_of_all_elements_located((By.ID, "video-title")))
print("Started process..")
print("Scraping transcripts from {} channel...".format(self.channel_name))
toggle = True
try:
count = 0
links_list = []
# got all links for given channel
for item in links[0:self.vid_cnt]:
links_list.append(item.get_attribute('href'))
#looping through each link and fetching transcripts.
for link in links_list:
self.driver.get(link)
opt_btn = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, "//body/ytd-app/div[@id='content']/ytd-page-manager[@id='page-manager']/ytd-watch-flexy[@class='style-scope ytd-page-manager hide-skeleton']/div[@id='columns']/div[@id='primary']/div[@id='primary-inner']/div[@id='info']/div[@id='info-contents']/ytd-video-primary-info-renderer[@class='style-scope ytd-watch-flexy']/div[@id='container']/div[@id='info']/div[@id='menu-container']/div[@id='menu']/ytd-menu-renderer[@class='style-scope ytd-video-primary-info-renderer']/yt-icon-button[@id='button']/button[1]")))
opt_btn.click()
# if transcript btn doesn't exit for a video or if transcipts section doesn't show up.
try:
transcript_btn = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "ytd-popup-container.style-scope.ytd-app:nth-child(13) iron-dropdown.style-scope.ytd-popup-container:nth-child(1) div.style-scope.iron-dropdown ytd-menu-popup-renderer.style-scope.ytd-popup-container paper-listbox.style-scope.ytd-menu-popup-renderer:nth-child(1) > ytd-menu-service-item-renderer.style-scope.ytd-menu-popup-renderer:nth-child(2)")))
transcript_btn.click()
sleep(2)
# to toggle timestamp we just need to run this once.
if toggle:
WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, "//ytd-menu-renderer[@class='style-scope ytd-engagement-panel-title-header-renderer']//button[@id='button']"))).click()
WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, "//ytd-menu-service-item-renderer[@class='style-scope ytd-menu-popup-renderer']"))).click()
toggle = False #once toggled, no need to toggle again.
transcript_text = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "ytd-transcript-body-renderer"))).text
count += 1
except:
count += 1
print(f"no transcipt found for {count}th video!")
continue #will go for next iteration of current for loop.
# to append transcript text to a file
filename = f'{self.channel_name}.txt'
filepath = os.path.join("scraped_transcripts", filename)
with open(filepath, 'a') as file:
file.write(transcript_text)
print(f'{count}th video done.')
except Exception as e:
print(e)
finally:
self.driver.quit()
print(f'Done scraping transcripts of {self.vid_cnt} videos from {self.channel_name} channel.')
print('---------------------------------------------------------------------------------')
#sending scraped transcript for sentiment and profanity analysis.
self.profanity_analysis(filepath)
self.sentiment_analysis(filepath)
# deleting transcripts after use.
if self.del_transcript == True:
os.remove(filepath)
# Profanity analysis
def profanity_analysis(self, filepath):
curse_word_count = 0
total_word_count = 0
curse_words = []
with open(filepath, 'r') as file:
text = file.read()
sentences = text.split('\n')
for sentence in sentences:
word_list = sentence.split(' ')
for word in word_list:
if profanity.contains_profanity(word): #returns True if word is curse/swear/profane word.
curse_words.append(word)
curse_word_count += 1
total_word_count += 1
profanity_percentage = (curse_word_count/total_word_count) * 100
print(f'Curse words used: {curse_words}')
print(f'Profanity/curse/swear percentage: {round(profanity_percentage, 5)}%')
# Sentiment analysis
def sentiment_analysis(self, filepath):
pos_count = 0
neg_count = 0
total_count = 0
with open(filepath, 'r') as file:
text = file.read()
subjectivity = TextBlob(text).subjectivity
sentence_li = text.split('\n')
for sentence in sentence_li:
blob = TextBlob(sentence)
if blob.sentiment.polarity >= 0.1:
pos_count += 1
elif blob.sentiment.polarity < 0:
neg_count += 1
total_count += 1
pos_sentiment = (pos_count/total_count) * 100
neg_sentiment = (neg_count/total_count) * 100
print(f'Positive sentiment: {round(pos_sentiment, 4)}%\nNegative sentiment: {round(neg_sentiment, 4)}%\nSubjectivity: {round(subjectivity, 4)}')
obj = ProfanityBot('https://www.youtube.com/channel/UC3DkFux8Iv-aYnTRWzwaiBA', 7)
obj.fetch_transcripts()