tweet_scraping_demo_code – Lisa Götschi

In [1]:

import os
import time

import pandas as pd
import tweepy

In [2]:

# Twitter API credentials.
# Each value is taken from an environment variable so that real secrets never
# have to be stored in (or committed with) the notebook. When a variable is not
# set, the original "add_key" placeholder is used, so the cell still runs.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "add_key")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "add_key")
access_key = os.environ.get("TWITTER_ACCESS_KEY", "add_key")
access_secret = os.environ.get("TWITTER_ACCESS_SECRET", "add_key")
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN", "add_key")

In [3]:

# API v2 client authenticated with the bearer token only; tweepy is asked to
# wait automatically whenever a rate limit is hit.
client = tweepy.Client(
    bearer_token=bearer_token,
    wait_on_rate_limit=True,
)

In [4]:

# Full-archive search for German, non-retweet tweets containing the phrase
# "ukraine krieg" between 2022-02-28 and 2022-11-30. Every page returned by
# the paginator is stored as one entry of raw_tweets.
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-02-28T00:00:00Z',
    'end_time': '2022-11-30T00:00:00Z',
}

raw_tweets = []
for page in tweepy.Paginator(client.search_all_tweets, **search_params):
    # Pause one second between page requests
    time.sleep(1)
    raw_tweets.append(page)

In [5]:

# Same query, restricted to 2022-08-24T08:23:52Z .. 2022-11-30; the pages are
# appended to the existing raw_tweets list.
# NOTE(review): this window lies inside the 2022-02-28 .. 2022-11-30 range that
# was already queried into raw_tweets, so tweets may be stored twice - confirm
# whether this is intended.
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-08-24T08:23:52Z',
    'end_time': '2022-11-30T00:00:00Z',
}

for page in tweepy.Paginator(client.search_all_tweets, **search_params):
    # Pause one second between page requests
    time.sleep(1)
    raw_tweets.append(page)

In [6]:

# Same query, restricted to 2022-09-22T06:14:34Z .. 2022-11-30; the pages are
# appended to the existing raw_tweets list.
# NOTE(review): this window lies inside the 2022-02-28 .. 2022-11-30 range that
# was already queried into raw_tweets, so tweets may be stored twice - confirm
# whether this is intended.
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-09-22T06:14:34Z',
    'end_time': '2022-11-30T00:00:00Z',
}

for page in tweepy.Paginator(client.search_all_tweets, **search_params):
    # Pause one second between page requests
    time.sleep(1)
    raw_tweets.append(page)

In [7]:

# Number of response pages collected so far (one list entry per paginator
# page, not per tweet).
len(raw_tweets)

In [8]:

# Same query for the window 2022-02-28 .. 2022-08-24T08:23:52Z, collected
# page by page into raw_tweets2.
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-02-28T00:00:00Z',
    'end_time': '2022-08-24T08:23:52Z',
}

raw_tweets2 = []
for page in tweepy.Paginator(client.search_all_tweets, **search_params):
    # Pause one second between page requests
    time.sleep(1)
    raw_tweets2.append(page)

In [9]:

# Same query for the window 2022-02-28 .. 2022-06-15T20:53:28Z, collected
# page by page into raw_tweets3.
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-02-28T00:00:00Z',
    'end_time': '2022-06-15T20:53:28Z',
}

raw_tweets3 = []
for page in tweepy.Paginator(client.search_all_tweets, **search_params):
    # Pause one second between page requests
    time.sleep(1)
    raw_tweets3.append(page)

In [10]:

# Same query for the window 2022-02-28 .. 2022-03-04T05:46:41Z, collected
# page by page into raw_tweets4.
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-02-28T00:00:00Z',
    'end_time': '2022-03-04T05:46:41Z',
}

raw_tweets4 = []
for page in tweepy.Paginator(client.search_all_tweets, **search_params):
    # Pause one second between page requests
    time.sleep(1)
    raw_tweets4.append(page)

In [11]:

# Same query for the window 2022-02-24T23:59:59Z .. 2022-03-01T08:36:47Z,
# collected page by page into raw_tweets5.
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-02-24T23:59:59Z',
    'end_time': '2022-03-01T08:36:47Z',
}

raw_tweets5 = []
for page in tweepy.Paginator(client.search_all_tweets, **search_params):
    # Pause one second between page requests
    time.sleep(1)
    raw_tweets5.append(page)

In [12]:

# Flatten the response pages in raw_tweets into one record per tweet.
user_dict = {}  # never filled in this cell; kept so the name stays defined

result = [
    {
        'text': tweet.text,
        'created_at': tweet.created_at,
        'geo': tweet.geo,
    }
    for response in raw_tweets
    # A page without any matching tweet has data=None; treat it as empty
    # instead of failing with "NoneType is not iterable".
    for tweet in (response.data or [])
]

# One row per tweet. The columns are named explicitly so that an empty result
# still yields a frame with the expected schema.
df = pd.DataFrame(result, columns=['text', 'created_at', 'geo'])

In [13]:

# Flatten the response pages in raw_tweets2 into one record per tweet.
user_dict2 = {}  # never filled in this cell; kept so the name stays defined

result2 = [
    {
        'text': tweet.text,
        'created_at': tweet.created_at,
        'geo': tweet.geo,
    }
    for response in raw_tweets2
    # A page without any matching tweet has data=None; treat it as empty
    # instead of failing with "NoneType is not iterable".
    for tweet in (response.data or [])
]

# One row per tweet. The columns are named explicitly so that an empty result
# still yields a frame with the expected schema.
df2 = pd.DataFrame(result2, columns=['text', 'created_at', 'geo'])

In [14]:

# Write df2 to disk as UTF-8 CSV (the DataFrame index is written too, which is
# the pandas default).
df2.to_csv(path_or_buf='uk_tag4.csv', encoding='utf-8')

In [15]:

# Number of response pages held in raw_tweets (pages, not individual tweets).
len(raw_tweets)

In [16]:

# Flatten the response pages in raw_tweets3 into one record per tweet.
user_dict3 = {}  # never filled in this cell; kept so the name stays defined

result3 = [
    {
        'text': tweet.text,
        'created_at': tweet.created_at,
        'geo': tweet.geo,
    }
    for response in raw_tweets3
    # A page without any matching tweet has data=None; treat it as empty
    # instead of failing with "NoneType is not iterable".
    for tweet in (response.data or [])
]

# One row per tweet. The columns are named explicitly so that an empty result
# still yields a frame with the expected schema.
df3 = pd.DataFrame(result3, columns=['text', 'created_at', 'geo'])

In [17]:

# Write df3 to disk as UTF-8 CSV (the DataFrame index is written too, which is
# the pandas default).
df3.to_csv(path_or_buf='uk_tag5.csv', encoding='utf-8')

In [18]:

# Flatten the response pages in raw_tweets4 into one record per tweet.
user_dict4 = {}  # never filled in this cell; kept so the name stays defined

result4 = [
    {
        'text': tweet.text,
        'created_at': tweet.created_at,
        'geo': tweet.geo,
    }
    for response in raw_tweets4
    # A page without any matching tweet has data=None; treat it as empty
    # instead of failing with "NoneType is not iterable".
    for tweet in (response.data or [])
]

# One row per tweet. The columns are named explicitly so that an empty result
# still yields a frame with the expected schema.
df4 = pd.DataFrame(result4, columns=['text', 'created_at', 'geo'])

In [19]:

# Write df4 to disk as UTF-8 CSV (the DataFrame index is written too, which is
# the pandas default).
df4.to_csv(path_or_buf='uk_tag6.csv', encoding='utf-8')

In [20]:

# Flatten the response pages in raw_tweets5 into one record per tweet.
user_dict5 = {}  # never filled in this cell; kept so the name stays defined

result5 = [
    {
        'text': tweet.text,
        'created_at': tweet.created_at,
        'geo': tweet.geo,
    }
    for response in raw_tweets5
    # A page without any matching tweet has data=None; treat it as empty
    # instead of failing with "NoneType is not iterable".
    for tweet in (response.data or [])
]

# One row per tweet. The columns are named explicitly so that an empty result
# still yields a frame with the expected schema.
df5 = pd.DataFrame(result5, columns=['text', 'created_at', 'geo'])

In [21]:

# Write df5 to disk as UTF-8 CSV (the DataFrame index is written too, which is
# the pandas default).
df5.to_csv(path_or_buf='uk_tag7.csv', encoding='utf-8')

In [22]:

# Same query for the window 2022-02-24 .. 2022-02-25T00:00:58Z, collected
# page by page into raw_tweets6.
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-02-24T00:00:00Z',
    'end_time': '2022-02-25T00:00:58Z',
}

raw_tweets6 = []
for page in tweepy.Paginator(client.search_all_tweets, **search_params):
    # Pause one second between page requests
    time.sleep(1)
    raw_tweets6.append(page)

In [23]:

# Flatten the response pages in raw_tweets6 into one record per tweet.
user_dict6 = {}  # never filled in this cell; kept so the name stays defined

result6 = [
    {
        'text': tweet.text,
        'created_at': tweet.created_at,
        'geo': tweet.geo,
    }
    for response in raw_tweets6
    # A page without any matching tweet has data=None; treat it as empty
    # instead of failing with "NoneType is not iterable".
    for tweet in (response.data or [])
]

# One row per tweet. The columns are named explicitly so that an empty result
# still yields a frame with the expected schema.
df6 = pd.DataFrame(result6, columns=['text', 'created_at', 'geo'])

In [24]:

# Write df6 to disk as UTF-8 CSV (the DataFrame index is written too, which is
# the pandas default).
df6.to_csv(path_or_buf='uk_tag8.csv', encoding='utf-8')

In [25]:

# Same query for the window 2022-08-24T08:23:16Z .. 2022-09-22T06:14:34Z,
# collected page by page into raw_tweets7.
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-08-24T08:23:16Z',
    'end_time': '2022-09-22T06:14:34Z',
}

raw_tweets7 = []
for page in tweepy.Paginator(client.search_all_tweets, **search_params):
    # Pause one second between page requests
    time.sleep(1)
    raw_tweets7.append(page)

In [26]:

# Flatten the response pages in raw_tweets7 into one record per tweet.
user_dict7 = {}  # never filled in this cell; kept so the name stays defined

result7 = [
    {
        'text': tweet.text,
        'created_at': tweet.created_at,
        'geo': tweet.geo,
    }
    for response in raw_tweets7
    # A page without any matching tweet has data=None; treat it as empty
    # instead of failing with "NoneType is not iterable".
    for tweet in (response.data or [])
]

# One row per tweet. The columns are named explicitly so that an empty result
# still yields a frame with the expected schema.
df7 = pd.DataFrame(result7, columns=['text', 'created_at', 'geo'])

In [27]:

# Write df7 to disk as UTF-8 CSV (the DataFrame index is written too, which is
# the pandas default).
df7.to_csv(path_or_buf='uk_tag2.csv', encoding='utf-8')

In [28]:

# Same query for the window 2022-11-29T23:57:32Z .. 2022-11-30T23:59:59Z,
# collected page by page into raw_tweets8.
search_params = {
    'query': '"ukraine krieg" -is:retweet lang:de',
    'tweet_fields': ['created_at', 'geo', 'text'],
    'start_time': '2022-11-29T23:57:32Z',
    'end_time': '2022-11-30T23:59:59Z',
}

raw_tweets8 = []
for page in tweepy.Paginator(client.search_all_tweets, **search_params):
    # Pause one second between page requests
    time.sleep(1)
    raw_tweets8.append(page)

In [29]:

# Flatten the response pages in raw_tweets8 into one record per tweet.
user_dict8 = {}  # never filled in this cell; kept so the name stays defined

result8 = [
    {
        'text': tweet.text,
        'created_at': tweet.created_at,
        'geo': tweet.geo,
    }
    for response in raw_tweets8
    # A page without any matching tweet has data=None; treat it as empty
    # instead of failing with "NoneType is not iterable".
    for tweet in (response.data or [])
]

# One row per tweet. The columns are named explicitly so that an empty result
# still yields a frame with the expected schema.
df8 = pd.DataFrame(result8, columns=['text', 'created_at', 'geo'])

In [30]:

# Write df8 to disk as UTF-8 CSV (the DataFrame index is written too, which is
# the pandas default).
df8.to_csv(path_or_buf='uk_tag1.csv', encoding='utf-8')

In [31]:

# Added from a different notebook: example preparation and analysis of the first week

In [32]:

import nltk
import germansentiment 
from nltk.probability import FreqDist
from textblob_de import TextBlobDE
import csv
import string 
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from textblob_de import TextBlobDE as TextBlob

In [33]:

# Location of the first-week tweet export.
# NOTE(review): absolute, machine-specific path - adjust before running on
# another machine.
FIRST_WEEK_CSV = '/Users/lisa/Final_Project_geopolitcs/first_week.csv'

# Decode explicitly as UTF-8 so umlauts are read the same way on every
# platform; without it open() uses the locale's default encoding.
# NOTE(review): presumably the file is UTF-8 like the CSVs written by this
# notebook - confirm.
with open(FIRST_WEEK_CSV, 'r', encoding='utf-8') as csv_datei:
    # `reader` is never iterated; the file is consumed as one raw string, so
    # everything in the file (not only tweet text) ends up in `text`.
    reader = csv.reader(csv_datei, delimiter=',')
    text = csv_datei.read()

# The file handle is no longer needed once the text is in memory.
token_text = sent_tokenize(text)  # split into sentences with NLTK
words = text.split()              # whitespace-separated tokens

In [34]:

# Lower-case every token so that later counting is case-insensitive.
lowercase_words = [token.lower() for token in words]
print(len(lowercase_words))

In [35]:

# Characters stripped by remove_punc. A raw string is used so the backslash is
# a literal character rather than the start of an (invalid) "\," escape.
_PUNC_CHARS = r'''!()-[]{};:'"\, <>./?@#$%^&*_~'''

# Built once: maps every character above to "delete".
_PUNC_TABLE = str.maketrans('', '', _PUNC_CHARS)


def remove_punc(string):
    """Return ``string`` with all punctuation characters removed.

    The characters removed are ``!()-[]{};:'"\\,<>./?@#$%^&*_~`` and the
    space character. Any other character (for example ``+`` or ``=``) is
    left untouched.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        A copy of ``string`` without the characters listed above.
    """
    return string.translate(_PUNC_TABLE)
 
# Strip punctuation from every lower-cased token.
lowercase_words_clean = list(map(remove_punc, lowercase_words))

In [36]:

# Keep every non-empty token that does not start with 'http' or '@'.
# NOTE(review): tokens starting with 'http' / '@' are discarded here rather
# than replaced by the placeholders "http" / "@user" - confirm that dropping
# them is the intended behaviour.
tweet_words = [
    token
    for token in lowercase_words_clean
    if token and not token.startswith(('http', '@'))
]

In [37]:

# Fetch the NLTK stop-word corpus and keep the German list under the name
# `stopwords`.
nltk.download('stopwords')
from nltk.corpus import stopwords as stopwords_corpus
stopwords = stopwords_corpus.words('german')

In [38]:

# Additional, corpus-specific terms to exclude on top of the German stop words
# (mostly the search terms themselves and URL / mention placeholders).
other_stopwords = [
    "ukraine",
    "ukraine-krieg",
    "mehr",
    "-",
    "via",
    "+++",
    "ukrainekrieg",
    "krieg",
    "tonline",
    "@user",
    "http",
]
print(other_stopwords)

In [39]:

# Drop the German stop words, then plot the ten most frequent remaining tokens.
words_withoutstop = [token for token in tweet_words if token not in stopwords]

fdist = FreqDist(words_withoutstop)
fdist.plot(10)

# Token count before vs. after stop-word removal
print(len(words), len(words_withoutstop))

In [40]:

# Second pass: also drop the corpus-specific terms, then plot the ten most
# frequent remaining tokens.
words_withoutstop2 = [
    token for token in words_withoutstop if token not in other_stopwords
]
fdist2 = FreqDist(words_withoutstop2)
fdist2.plot(10)


# Token count before vs. after the second filter
print(len(words_withoutstop), len(words_withoutstop2))

In [41]:

# Join the filtered tokens into one string; every token is preceded by a
# single space (so a non-empty result starts with a space).
string_firstweek = ''.join(' ' + token for token in words_withoutstop2)

In [42]:

# Build the word cloud from the filtered first-week text.
wc = WordCloud(
    background_color='white',
    colormap='Paired',
    scale=3,
).generate(string_firstweek)

# Show it inline, save a PNG copy, and hide the axes.
plt.imshow(wc)
wc.to_file('wordcloud_firstweek.png')
plt.axis("off")

In [43]:

# Sentiment of the whole filtered first-week text, computed with the German
# TextBlob wrapper; the result is kept in sentiment_mw and printed.
blob = TextBlob(string_firstweek)
sentiment_mw = blob.sentiment

print(sentiment_mw)

tweet_scraping_demo_code.ipynb