====== Mining Tweets ======

<code python>
import re
import json
import codecs
import pandas as pd
from collections import Counter

# read the data into a list, one JSON-encoded tweet per line
tweets_data_path = 'tweets.json'
tweets_data = []
with open(tweets_data_path, "r") as tweets_file:
    for line in tweets_file:
        try:
            tweets_data.append(json.loads(line))
        except ValueError:
            continue
# print(len(tweets_data))

# return True if the keyword is found in the text (case-insensitive)
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(re.escape(word), text)
    if match:
        return True
    return False

# print(tweets_data[0].keys())

# write all tweet texts to a txt file
# (kept as an intermediary step and for mining with other software)
texts = [tweet['text'] for tweet in tweets_data]
with codecs.open("architecture-tweets.txt", "w", encoding="utf-8-sig") as fileOut:
    for txt in texts:
        fileOut.write(txt + "\n")

# count word frequencies in the exported file
with codecs.open("architecture-tweets.txt", "r", encoding="utf-8-sig") as fileIn:
    wordcount = Counter(fileIn.read().split())
for item in wordcount.items():
    print("{}\t{}".format(*item))

# DATAFRAME
## create a dataframe
tweets = pd.DataFrame()

## add three columns to the dataframe
tweets['text'] = [tweet['text'] for tweet in tweets_data]
tweets['lang'] = [tweet['lang'] for tweet in tweets_data]
tweets['country'] = [tweet['place']['country'] if tweet['place'] is not None else None
                     for tweet in tweets_data]

## one boolean column per spelling of the search term
keywords = ('architecture', '#architecture', 'Architecture', 'ARCHITECTURE')
for keyword in keywords:
    tweets[keyword] = tweets['text'].apply(lambda tweet, keyword=keyword: word_in_text(keyword, tweet))

## one boolean column per computation-related term
relevant = ('computer', 'computational', 'computation', 'digital', '3D', '4D', 'CAD', 'computer-aided')
for term in relevant:
    tweets[term] = tweets['text'].apply(lambda tweet, term=term: word_in_text(term, tweet))

## a tweet is relevant if it contains at least one of the terms above
tweets['relevant'] = tweets['text'].apply(
    lambda tweet: any(word_in_text(term, tweet) for term in relevant))

# return the first URL found in the text, or '' if there is none
def extract_link(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match = re.search(regex, text)
    if match:
        return match.group()
    return ''

# return the text with URLs stripped out
def extract_txt(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    return re.sub(regex, '', text).strip()

tweets['link'] = tweets['text'].apply(lambda tweet: extract_link(tweet))
</code>
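
A quick way to sanity-check the result is to tabulate the columns just added. The sketch below assumes the script above has already run and the ''tweets'' DataFrame is in scope; it only uses the ''lang'', ''relevant'', and ''text'' columns defined there.

<code python>
# Minimal sanity checks on the DataFrame built above (assumes `tweets` exists).
print(tweets['lang'].value_counts().head())            # most common tweet languages
print(tweets['relevant'].value_counts())               # relevant vs. non-relevant tweets
print(tweets.loc[tweets['relevant'], 'text'].head())   # a few relevant tweet texts
</code>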