import tweepy
import time
import pandas as pd
# In [1]:
# In [2]:
import os

# Twitter API credentials. Each value is taken from an environment variable so
# that real keys never have to be written into this file; when the variable is
# not set, the original "add_key" placeholder is used.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "add_key")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "add_key")
access_key = os.environ.get("TWITTER_ACCESS_KEY", "add_key")
access_secret = os.environ.get("TWITTER_ACCESS_SECRET", "add_key")
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN", "add_key")
# In [3]:
# Only the bearer token is handed to the client here.
client = tweepy.Client(bearer_token, wait_on_rate_limit=True)
# In [4]:
# Search for German, non-retweet tweets containing the exact phrase
# "ukraine krieg" and keep every page the paginator yields.
raw_tweets = []
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-02-28T00:00:00Z',
    'end_time': '2022-11-30T00:00:00Z',
}
for response in tweepy.Paginator(client.search_all_tweets, **search_params):
    # One-second pause per page, presumably to respect the endpoint's request limit.
    time.sleep(1)
    raw_tweets.append(response)
# In [5]:
# Second search pass; its pages are appended to the existing raw_tweets list.
# NOTE(review): this window (from 2022-08-24T08:23:52Z) lies inside the
# 2022-02-28 .. 2022-11-30 window of the first search loop in this file, so
# pages may be stored twice if both loops run to completion - confirm intent.
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-08-24T08:23:52Z',
    'end_time': '2022-11-30T00:00:00Z',
}
for response in tweepy.Paginator(client.search_all_tweets, **search_params):
    # One-second pause per page, presumably to respect the endpoint's request limit.
    time.sleep(1)
    raw_tweets.append(response)
# In [6]:
# Third search pass; its pages are appended to the existing raw_tweets list.
# NOTE(review): this window (from 2022-09-22T06:14:34Z) also lies inside the
# 2022-02-28 .. 2022-11-30 window of the first search loop in this file -
# confirm that the overlap is intended.
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-09-22T06:14:34Z',
    'end_time': '2022-11-30T00:00:00Z',
}
for response in tweepy.Paginator(client.search_all_tweets, **search_params):
    # One-second pause per page, presumably to respect the endpoint's request limit.
    time.sleep(1)
    raw_tweets.append(response)
# In [7]:
# Number of stored pages (response objects), not the number of tweets.
len(raw_tweets)
# In [8]:
# Same query, restricted to 2022-02-28 .. 2022-08-24T08:23:52Z; pages are
# collected in their own list.
raw_tweets2 = []
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-02-28T00:00:00Z',
    'end_time': '2022-08-24T08:23:52Z',
}
for response in tweepy.Paginator(client.search_all_tweets, **search_params):
    # One-second pause per page, presumably to respect the endpoint's request limit.
    time.sleep(1)
    raw_tweets2.append(response)
# In [9]:
# Same query, restricted to 2022-02-28 .. 2022-06-15T20:53:28Z; pages are
# collected in their own list.
raw_tweets3 = []
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-02-28T00:00:00Z',
    'end_time': '2022-06-15T20:53:28Z',
}
for response in tweepy.Paginator(client.search_all_tweets, **search_params):
    # One-second pause per page, presumably to respect the endpoint's request limit.
    time.sleep(1)
    raw_tweets3.append(response)
# In [10]:
# Same query, restricted to 2022-02-28 .. 2022-03-04T05:46:41Z; pages are
# collected in their own list.
raw_tweets4 = []
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-02-28T00:00:00Z',
    'end_time': '2022-03-04T05:46:41Z',
}
for response in tweepy.Paginator(client.search_all_tweets, **search_params):
    # One-second pause per page, presumably to respect the endpoint's request limit.
    time.sleep(1)
    raw_tweets4.append(response)
# In [11]:
# Same query, restricted to 2022-02-24T23:59:59Z .. 2022-03-01T08:36:47Z;
# pages are collected in their own list.
raw_tweets5 = []
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-02-24T23:59:59Z',
    'end_time': '2022-03-01T08:36:47Z',
}
for response in tweepy.Paginator(client.search_all_tweets, **search_params):
    # One-second pause per page, presumably to respect the endpoint's request limit.
    time.sleep(1)
    raw_tweets5.append(response)
# In [12]:
# Flatten the pages in raw_tweets into one row per tweet.
result = []
user_dict = {}  # not filled anywhere in this section; kept so the name stays defined
# Loop through each response object
for response in raw_tweets:
    # tweepy leaves response.data as None for a page without matches, which
    # would make the inner loop raise TypeError - treat it as an empty page.
    for tweet in response.data or []:
        # Put all of the information we want to keep in a single dictionary for each tweet
        result.append({
            'text': tweet.text,
            'created_at': tweet.created_at,
            'geo': tweet.geo,
        })

# Change this list of dictionaries into a dataframe
df = pd.DataFrame(result)
# In [13]:
# Flatten the pages in raw_tweets2 into one row per tweet and export them.
result2 = []
user_dict2 = {}  # not filled anywhere in this section; kept so the name stays defined
# Loop through each response object
for response in raw_tweets2:
    # tweepy leaves response.data as None for a page without matches, which
    # would make the inner loop raise TypeError - treat it as an empty page.
    for tweet in response.data or []:
        # Put all of the information we want to keep in a single dictionary for each tweet
        result2.append({
            'text': tweet.text,
            'created_at': tweet.created_at,
            'geo': tweet.geo,
        })

# Change this list of dictionaries into a dataframe
df2 = pd.DataFrame(result2)
# In [14]:
df2.to_csv('uk_tag4.csv', encoding='utf-8')
# In [15]:
# Number of stored pages in the first collection (not the number of tweets).
len(raw_tweets)
# In [16]:
# Flatten the pages in raw_tweets3 into one row per tweet and export them.
result3 = []
user_dict3 = {}  # not filled anywhere in this section; kept so the name stays defined
# Loop through each response object
for response in raw_tweets3:
    # tweepy leaves response.data as None for a page without matches, which
    # would make the inner loop raise TypeError - treat it as an empty page.
    for tweet in response.data or []:
        # Put all of the information we want to keep in a single dictionary for each tweet
        result3.append({
            'text': tweet.text,
            'created_at': tweet.created_at,
            'geo': tweet.geo,
        })

# Change this list of dictionaries into a dataframe
df3 = pd.DataFrame(result3)
# In [17]:
df3.to_csv('uk_tag5.csv', encoding='utf-8')
# In [18]:
# Flatten the pages in raw_tweets4 into one row per tweet and export them.
result4 = []
user_dict4 = {}  # not filled anywhere in this section; kept so the name stays defined
# Loop through each response object
for response in raw_tweets4:
    # tweepy leaves response.data as None for a page without matches, which
    # would make the inner loop raise TypeError - treat it as an empty page.
    for tweet in response.data or []:
        # Put all of the information we want to keep in a single dictionary for each tweet
        result4.append({
            'text': tweet.text,
            'created_at': tweet.created_at,
            'geo': tweet.geo,
        })

# Change this list of dictionaries into a dataframe
df4 = pd.DataFrame(result4)
# In [19]:
df4.to_csv('uk_tag6.csv', encoding='utf-8')
# In [20]:
# Flatten the pages in raw_tweets5 into one row per tweet and export them.
result5 = []
user_dict5 = {}  # not filled anywhere in this section; kept so the name stays defined
# Loop through each response object
for response in raw_tweets5:
    # tweepy leaves response.data as None for a page without matches, which
    # would make the inner loop raise TypeError - treat it as an empty page.
    for tweet in response.data or []:
        # Put all of the information we want to keep in a single dictionary for each tweet
        result5.append({
            'text': tweet.text,
            'created_at': tweet.created_at,
            'geo': tweet.geo,
        })

# Change this list of dictionaries into a dataframe
df5 = pd.DataFrame(result5)
# In [21]:
df5.to_csv('uk_tag7.csv', encoding='utf-8')
# In [22]:
# Same query, restricted to 2022-02-24T00:00:00Z .. 2022-02-25T00:00:58Z;
# pages are collected in their own list.
raw_tweets6 = []
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-02-24T00:00:00Z',
    'end_time': '2022-02-25T00:00:58Z',
}
for response in tweepy.Paginator(client.search_all_tweets, **search_params):
    # One-second pause per page, presumably to respect the endpoint's request limit.
    time.sleep(1)
    raw_tweets6.append(response)
# In [23]:
# Flatten the pages in raw_tweets6 into one row per tweet and export them.
result6 = []
user_dict6 = {}  # not filled anywhere in this section; kept so the name stays defined
# Loop through each response object
for response in raw_tweets6:
    # tweepy leaves response.data as None for a page without matches, which
    # would make the inner loop raise TypeError - treat it as an empty page.
    for tweet in response.data or []:
        # Put all of the information we want to keep in a single dictionary for each tweet
        result6.append({
            'text': tweet.text,
            'created_at': tweet.created_at,
            'geo': tweet.geo,
        })

# Change this list of dictionaries into a dataframe
df6 = pd.DataFrame(result6)
# In [24]:
df6.to_csv('uk_tag8.csv', encoding='utf-8')
# In [25]:
# Same query, restricted to 2022-08-24T08:23:16Z .. 2022-09-22T06:14:34Z;
# pages are collected in their own list.
raw_tweets7 = []
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-08-24T08:23:16Z',
    'end_time': '2022-09-22T06:14:34Z',
}
for response in tweepy.Paginator(client.search_all_tweets, **search_params):
    # One-second pause per page, presumably to respect the endpoint's request limit.
    time.sleep(1)
    raw_tweets7.append(response)
# In [26]:
# Flatten the pages in raw_tweets7 into one row per tweet and export them.
result7 = []
user_dict7 = {}  # not filled anywhere in this section; kept so the name stays defined
# Loop through each response object
for response in raw_tweets7:
    # tweepy leaves response.data as None for a page without matches, which
    # would make the inner loop raise TypeError - treat it as an empty page.
    for tweet in response.data or []:
        # Put all of the information we want to keep in a single dictionary for each tweet
        result7.append({
            'text': tweet.text,
            'created_at': tweet.created_at,
            'geo': tweet.geo,
        })

# Change this list of dictionaries into a dataframe
df7 = pd.DataFrame(result7)
# In [27]:
df7.to_csv('uk_tag2.csv', encoding='utf-8')
# In [28]:
# Same query, restricted to 2022-11-29T23:57:32Z .. 2022-11-30T23:59:59Z;
# pages are collected in their own list.
raw_tweets8 = []
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-11-29T23:57:32Z',
    'end_time': '2022-11-30T23:59:59Z',
}
for response in tweepy.Paginator(client.search_all_tweets, **search_params):
    # One-second pause per page, presumably to respect the endpoint's request limit.
    time.sleep(1)
    raw_tweets8.append(response)
# In [29]:
# Flatten the pages in raw_tweets8 into one row per tweet and export them.
result8 = []
user_dict8 = {}  # not filled anywhere in this section; kept so the name stays defined
# Loop through each response object
for response in raw_tweets8:
    # tweepy leaves response.data as None for a page without matches, which
    # would make the inner loop raise TypeError - treat it as an empty page.
    for tweet in response.data or []:
        # Put all of the information we want to keep in a single dictionary for each tweet
        result8.append({
            'text': tweet.text,
            'created_at': tweet.created_at,
            'geo': tweet.geo,
        })

# Change this list of dictionaries into a dataframe
df8 = pd.DataFrame(result8)
# In [30]:
df8.to_csv('uk_tag1.csv', encoding='utf-8')
# In [31]:
# Added from a different notebook: example preparation and analysis of the first week
# In [32]:
import nltk
import germansentiment
from nltk.probability import FreqDist
from textblob_de import TextBlobDE
import csv
import string
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from textblob_de import TextBlobDE as TextBlob
# In [33]:
# Load the first-week export as one raw string (header row and commas
# included) and derive sentence tokens and whitespace-separated words from it.
# The encoding is given explicitly so German umlauts decode the same way on
# every platform; the exports written earlier in this file are UTF-8 as well.
with open('/Users/lisa/Final_Project_geopolitcs/first_week.csv', 'r', encoding='utf-8') as csv_datei:
    reader = csv.reader(csv_datei, delimiter=',')  # created but not consumed in this section
    text = csv_datei.read()

# NOTE(review): sent_tokenize() is called without a language argument although
# the text is German - confirm whether language='german' was intended.
token_text = sent_tokenize(text)
words = text.split()
# In [34]:
# Lower-case every whitespace-separated token and report how many there are.
lowercase_words = [token.lower() for token in words]
print(len(lowercase_words))
# In [35]:
def remove_punc(string):
    """Return *string* with all listed punctuation characters removed.

    The characters dropped are ``!()-[]{};:'"\\,<>./?@#$%^&*_~`` and the
    space character. Every other character is kept, in its original order.

    Args:
        string: The text to clean. The parameter name is kept for
            compatibility; inside this function it hides the ``string`` module.

    Returns:
        A new ``str`` without the characters listed above.
    """
    # The backslash is escaped explicitly: the previous literal contained the
    # invalid escape sequence "\,", which newer Python versions warn about.
    punc = '''!()-[]{};:'"\\, <>./?@#$%^&*_~'''
    # One translate() pass deletes every listed character at once.
    return string.translate(str.maketrans('', '', punc))
lowercase_words_clean = [remove_punc(i) for i in lowercase_words]
# In [36]:
# Replace links and mentions with the placeholder tokens "http" and "@user";
# every other token is kept in its punctuation-free form.
tweet_words = []
for raw_word, word in zip(lowercase_words, lowercase_words_clean):
    if word.startswith('http'):
        word = "http"
    elif raw_word.startswith('@'):
        # Test the uncleaned token: remove_punc() has already stripped the
        # '@' from `word`, so checking `word` here could never match.
        word = '@user'
    tweet_words.append(word)
# In [37]:
# Download the NLTK stop-word corpus and load the German word list.
nltk.download('stopwords')
from nltk.corpus import stopwords

# From here on the name `stopwords` refers to the list of German stop words,
# no longer to the imported corpus object.
stopwords = stopwords.words('german')
# In [38]:
# Additional tokens to drop from the analysis.
other_stopwords = [
    "ukraine",
    "ukraine-krieg",
    "mehr",
    "-",
    "via",
    "+++",
    "ukrainekrieg",
    "krieg",
    "tonline",
    "@user",
    "http",
]
print(other_stopwords)
# In [39]:
# Drop German stop words, then plot the ten most frequent remaining tokens.
# The stop words are copied into a set once so that each membership test is a
# hash lookup instead of a scan over the whole list.
german_stopword_set = set(stopwords)
words_withoutstop = [word for word in tweet_words if word not in german_stopword_set]

fdist = FreqDist(words_withoutstop)
fdist.plot(10)
# Token count before and after stop-word removal.
print(len(words), len(words_withoutstop))
# In [40]:
# Second filter pass: remove the tokens listed in other_stopwords, then plot
# the ten most frequent remaining tokens.
words_withoutstop2 = [token for token in words_withoutstop if token not in other_stopwords]

fdist2 = FreqDist(words_withoutstop2)
fdist2.plot(10)
# Token count before and after this second pass.
print(len(words_withoutstop), len(words_withoutstop2))
# In [41]:
# Join the filtered tokens into one string, each token preceded by a single
# space (so a non-empty result starts with a space). str.join builds the
# string in one pass instead of re-copying it for every token.
string_firstweek = ''.join(' ' + x for x in words_withoutstop2)
# In [42]:
# Render a word cloud of the cleaned first-week text, show it without axes
# and save it as a PNG file.
wordcloud_options = {
    'scale': 3,
    'colormap': 'Paired',
    'background_color': 'white',
}
wc = WordCloud(**wordcloud_options)
wc.generate(string_firstweek)

plt.imshow(wc)
wc.to_file('wordcloud_firstweek.png')
plt.axis("off")
# In [43]:
# Score the sentiment of the cleaned first-week text, keep the result and
# print it.
blob = TextBlob(string_firstweek)
sentiment_mw = blob.sentiment
print(sentiment_mw)