"""Download tweets matching a fixed search query into CSV and JSON files.

This script was recovered from a git patch that adds two new files: an empty
``.projectfile`` and ``twitter-api-example.py`` (the code below).

It pages backwards through the search results using ``max_id``. Each tweet is
appended to ``cotas.json`` as one JSON document per line and to ``cotas.csv``
as a row of selected fields.

Twitter credentials are read from the environment variables listed in
``CREDENTIAL_ENV_VARS`` instead of being hard-coded in the source, so that
secrets are not committed to version control. The script exits with an error
message if any of them is missing.

Requires Python 3 (files are opened with an explicit encoding).
"""
import csv
import json
import logging
import os
import sys
import time

import jsonpickle  # unused in this script; kept from the original import list
import tweepy

logging.basicConfig(level=logging.INFO)

# ---------------------------------------------------------------------------
# Credentials: export these variables before running the script.
# ---------------------------------------------------------------------------
CREDENTIAL_ENV_VARS = (
    'TWITTER_CONSUMER_KEY',
    'TWITTER_CONSUMER_SECRET',
    'TWITTER_ACCESS_TOKEN',
    'TWITTER_ACCESS_TOKEN_SECRET',
)
missing_vars = [name for name in CREDENTIAL_ENV_VARS if not os.environ.get(name)]
if missing_vars:
    sys.exit("Missing Twitter credentials; set: " + ", ".join(missing_vars))

consumer_key, consumer_secret, access_token, access_token_secret = (
    os.environ[name] for name in CREDENTIAL_ENV_VARS
)

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# ---------------------------------------------------------------------------
# Search configuration -- the part below is the one that really matters.
# ---------------------------------------------------------------------------
searchQuery = '((cota OR cotas OR universidade OR universidades) AND (racial OR raciais)) OR ((universidade OR universidades) AND (cota OR cotas)) '

# Suggestion from the original author: drop `sincedate` to simplify the code.
# They also recall it causing errors, and in any case the API only goes back
# about a week.
sincedate = "2018-11-20"
untildate = "2018-11-21"

# TODO: try lowering this limit, just so runs do not burn through the API key.
maxTweets = 10000000

tweetsPerQry = 100  # this is the max the API permits

# TODO: better to parameterize these output files.
CSV_PATH = 'cotas.csv'
JSON_PATH = 'cotas.json'

# If results from a specific ID onwards are required, set sinceId to that ID;
# otherwise default to no lower limit and go as far back as the API allows.
sinceId = None

# If only results below a specific ID are wanted, set max_id to that ID;
# otherwise default to no upper limit and start from the most recent tweet
# matching the search query.
# (`-1L` in the original is a Python 2 long literal and a SyntaxError on
# Python 3; a plain int behaves the same.)
max_id = -1
# e.g. max_id = 1045463072670920704

# The original retried forever, with no delay, on any TweepError; a permanent
# failure (such as rejected credentials) therefore never terminated.
MAX_CONSECUTIVE_ERRORS = 5
RETRY_DELAY_SECONDS = 5

tweetCount = 0
consecutive_errors = 0
print("Downloading max {0} tweets".format(maxTweets))

# `with` guarantees both files are flushed and closed even if the loop raises.
# newline='' is what the csv module expects for files it writes to, and the
# explicit UTF-8 encoding lets tweet text be written as str instead of bytes.
with open(CSV_PATH, 'a', encoding='utf-8', newline='') as csvFile, \
        open(JSON_PATH, 'a', encoding='utf-8') as jsonFile:
    csvWriter = csv.writer(csvFile)

    while tweetCount < maxTweets:
        # Build the same parameter sets as the original four-way branch:
        # `since_id` replaces the `since` date when sinceId is set, and
        # `max_id` is sent only once a previous page established it.
        search_kwargs = {
            'q': searchQuery,
            'until': untildate,
            'count': tweetsPerQry,
        }
        if sinceId:
            search_kwargs['since_id'] = sinceId
        else:
            search_kwargs['since'] = sincedate
        if max_id > 0:
            # max_id is inclusive, so subtract one to skip the tweet that
            # ended the previous page.
            search_kwargs['max_id'] = str(max_id - 1)

        # Only the API call is guarded, so file-writing errors are not
        # mistaken for API errors.
        try:
            new_tweets = api.search(**search_kwargs)
        except tweepy.TweepError as e:
            print("some error : " + str(e))
            consecutive_errors += 1
            if consecutive_errors >= MAX_CONSECUTIVE_ERRORS:
                print("Giving up after {0} consecutive errors".format(
                    consecutive_errors))
                break
            time.sleep(RETRY_DELAY_SECONDS)
            continue
        consecutive_errors = 0

        if not new_tweets:
            print("No more tweets found")
            break

        for tweet in new_tweets:
            # One raw JSON document per line.
            json.dump(tweet._json, jsonFile)
            jsonFile.write('\n')
            # Not every tweet field is included in the CSV.
            csvWriter.writerow([
                tweet.created_at,
                tweet.id,
                tweet.in_reply_to_status_id,
                tweet.in_reply_to_user_id,
                tweet.in_reply_to_screen_name,
                tweet.user.id,
                tweet.user.screen_name,
                tweet.user.followers_count,
                tweet.is_quote_status,
                tweet.retweet_count,
                tweet.favorite_count,
                tweet.lang,
                # Written as str: on Python 3, encoding to bytes here would
                # put a literal b'...' repr into the CSV.
                tweet.text,
            ])

        tweetCount += len(new_tweets)
        print("Downloaded {0} tweets".format(tweetCount))
        # The next page is requested below the last tweet of this page.
        max_id = new_tweets[-1].id

print("Downloaded {0} tweets".format(tweetCount))