This shows you the differences between two versions of the page.
— |
mine-tweets-py [2017/01/12 14:06] (current) zoza created |
||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== Mining Tweets ====== | ||
+ | <code python> | ||
+ | import re, json, codecs | ||
+ | import pandas as pd | ||
+ | from collections import Counter, OrderedDict | ||
+ | |||
# Read the raw Twitter stream dump: one JSON tweet object per line.
tweets_data_path = 'tweets.json'

def _load_tweets(path):
    """Parse one JSON tweet per line from *path* into a list of dicts.

    Malformed or empty lines are skipped so one bad record does not
    abort the whole import.
    """
    tweets = []
    # BUG FIX: use a context manager so the file handle is closed, and
    # catch only JSON parse errors (json.JSONDecodeError is a subclass
    # of ValueError) instead of a bare `except:` that hid every bug.
    with open(path, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweets.append(json.loads(line))
            except ValueError:
                continue
    return tweets

tweets_data = _load_tweets(tweets_data_path)

# print(len(tweets_data))
+ | |||
+ | # return True if keyword is found in the text | ||
+ | def word_in_text(word, text): | ||
+ | word = word.lower() | ||
+ | text = text.lower() | ||
+ | match = re.search(word, text) | ||
+ | if match: | ||
+ | return True | ||
+ | return False | ||
+ | |||
# print(tweets_data[0].keys())
# Write all tweet texts to a txt file (kept as an intermediary step and
# for mining with other software).
texts = [tweet['text'] for tweet in tweets_data]

# BUG FIX: the codecs stream already encodes to UTF-8 on write, so the
# original `txt.encode('ascii','ignore')` handed it bytes (TypeError on
# Python 3, double-encoding on Python 2). Write the text itself, one
# tweet per line — without the newline separator, the last word of one
# tweet fused with the first word of the next and corrupted the counts.
with codecs.open("architecture-tweets.txt", "w", encoding="utf-8-sig") as file_out:
    for txt in texts:
        file_out.write(txt + "\n")

with codecs.open(r"architecture-tweets.txt", "r", encoding="utf-8-sig") as file_in:
    wordcount = Counter(file_in.read().split())

for word, count in wordcount.items():
    print("{}\t{}".format(word, count))
+ | |||
# DATAFRAME
## Build a dataframe with one row per tweet.
tweets = pd.DataFrame()

## Add three columns: text, language, and country (None when the tweet
## carries no place information).
# BUG FIX: on Python 3 `map()` returns a lazy iterator, which pandas
# does not expand into a column of values — use list comprehensions so
# the columns are materialized on both Python 2 and 3.
tweets['text'] = [tweet['text'] for tweet in tweets_data]
tweets['lang'] = [tweet['lang'] for tweet in tweets_data]
tweets['country'] = [tweet['place']['country'] if tweet['place'] is not None else None
                     for tweet in tweets_data]

keywords = ('architecture', '#architecture', 'Architecture', 'ARCHITECTURE')

# One boolean column per keyword spelling (replaces four copy-pasted
# per-index lines). The default argument pins the current keyword so
# the lambda does not late-bind to the loop variable.
for keyword in keywords:
    tweets[keyword] = tweets['text'].apply(lambda tweet, kw=keyword: word_in_text(kw, tweet))
+ | |||
relevant = ('computer', 'computational', 'computation', 'digital', '3D', '4D', 'CAD', 'computer-aided')

# One boolean column per computation-related term (replaces eight
# copy-pasted per-index lines). The default argument pins the current
# term so the lambda does not late-bind to the loop variable.
for term in relevant:
    tweets[term] = tweets['text'].apply(lambda tweet, t=term: word_in_text(t, tweet))

# A tweet is 'relevant' when it mentions any of the terms above —
# any() replaces the original eight-way `or` chain.
tweets['relevant'] = tweets['text'].apply(
    lambda tweet: any(word_in_text(term, tweet) for term in relevant))
+ | |||
def extract_link(text):
    """Return the first http(s):// or www. URL found in *text*, or ''."""
    url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    found = re.search(url_pattern, text)
    return found.group() if found else ''
+ | |||
def extract_txt(text):
    """Return the first http(s):// or www. URL found in *text*, or ''.

    NOTE(review): this is byte-for-byte the same logic as extract_link
    despite its name — presumably it was meant to return the text with
    URLs stripped; confirm the intent before relying on it. Behavior is
    preserved exactly here.
    """
    url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    found = re.search(url_pattern, text)
    return found.group() if found else ''
+ | |||
+ | tweets['link'] = tweets['text'].apply(lambda tweet: extract_link(tweet)) | ||
+ | |||
+ | </code> |