#!/usr/bin/env python3
# NOTE(review): SOURCE was a git diff; this is the cleaned-up post-patch
# script with two bug fixes (argv count check, error-loop bail-out).
import csv
import json
import logging
import sys

import tweepy

logging.basicConfig(level=logging.INFO)


def search(api, sincedate, untildate, csvFile, jsonFile):
    """Download tweets matching the quota/university query and append them
    to the given output file handles.

    api       -- authenticated tweepy.API instance
    sincedate -- earliest date (YYYY-MM-DD) to search from
    untildate -- latest date (YYYY-MM-DD) to search until
    csvFile   -- writable file object; one CSV row per tweet
    jsonFile  -- writable file object; newline-delimited JSON, one object per tweet
    """
    searchQuery = '((cota OR cotas OR universidade OR universidades) AND (racial OR raciais)) OR ((universidade OR universidades) AND (cota OR cotas)) '

    maxTweets = 1000000  # safety cap so a runaway loop cannot burn the API key
    tweetsPerQry = 100   # this is the max the API permits

    csvWriter = csv.writer(csvFile)

    # If results from a specific ID onwards are required, set since_id to it;
    # else default to no lower limit and go as far back as the API allows.
    sinceId = None

    # If results only below a specific ID are wanted, set max_id to it;
    # else start from the most recent tweet matching the search query.
    # Python 3 has 9223372036854775807 as max int.
    max_id = sys.maxsize

    tweetCount = 0
    failures = 0  # consecutive API errors; used to avoid an infinite retry loop
    print("Downloading max {0} tweets".format(maxTweets))
    while tweetCount < maxTweets:
        try:
            # Build the query kwargs once instead of four near-identical
            # call sites: `since` only applies when paging is not anchored
            # by since_id, and max_id is omitted once exhausted.
            kwargs = {'q': searchQuery, 'until': untildate, 'count': tweetsPerQry}
            if max_id > 0:
                kwargs['max_id'] = str(max_id - 1)
            if sinceId:
                kwargs['since_id'] = sinceId
            else:
                kwargs['since'] = sincedate
            new_tweets = api.search(**kwargs)
            if not new_tweets:
                print("No more tweets found")
                break
            for tweet in new_tweets:
                json.dump(tweet._json, jsonFile)
                jsonFile.write('\n')
                # Not every field is written to the CSV.
                csvWriter.writerow([
                    tweet.created_at,
                    tweet.id,
                    tweet.in_reply_to_status_id,
                    tweet.in_reply_to_user_id,
                    tweet.in_reply_to_screen_name,
                    tweet.user.id,
                    tweet.user.screen_name,
                    tweet.user.followers_count,
                    tweet.is_quote_status,
                    tweet.retweet_count,
                    tweet.favorite_count,
                    tweet.lang,
                    tweet.text.encode('utf-8'),
                ])
            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            max_id = new_tweets[-1].id
            failures = 0
        except tweepy.TweepError as e:
            # BUG FIX: the original `continue`d unconditionally, which spins
            # forever on a persistent error (e.g. revoked credentials).
            print("some error : " + str(e))
            failures += 1
            if failures >= 5:
                print("giving up after 5 consecutive errors")
                break
            continue

    print("Downloaded {0} tweets".format(tweetCount))


if __name__ == "__main__":

    # SECURITY NOTE(review): these credentials are hard-coded and were
    # committed to version control — rotate them and load from environment
    # variables instead.
    ####input your credentials here
    consumer_key = 'pcn6szyFLVnzxDclKazZ3tQPI'
    consumer_secret = 'OGKptAjYnoh33mvPEkBPQgcApNXFysbCUx2CjQwjtHhr7Z9unO'
    access_token = '1127995977963646977-vqZMlKE8vJLSFMWmL6J9ouEZBncPi1'
    access_token_secret = 'xT6WMGkUZJULIMAwpG0AUWtOFYDdGMBYAFSHUZIo9NLYh'

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    # NOTE: the standard search API only reaches roughly one week into the
    # past, regardless of the since date supplied.
    # BUG FIX: the script consumes argv[1]..argv[4], so len(sys.argv) must be
    # 5; the original checked `!= 4` and then crashed with IndexError on
    # sys.argv[4] whenever the check passed.
    if len(sys.argv) != 5:
        print("Usage " + sys.argv[0] + " <sincedate> <untildate> <csvFile> <jsonFile>")
        sys.exit(1)
    sincedate = sys.argv[1]
    untildate = sys.argv[2]

    # Output files are parameterized; `with` guarantees both handles are
    # closed even if search() raises.
    with open(sys.argv[3], 'a') as csvFile, open(sys.argv[4], 'a') as jsonFile:
        search(api, sincedate, untildate, csvFile, jsonFile)