User Tools

Site Tools


mine-tweets-py

Mining Tweets

import re, json, codecs
import pandas as pd
from collections import Counter, OrderedDict

# Read the tweets into a list: the input file holds one JSON document per line.
tweets_data_path = 'tweets.json'

tweets_data = []
# The context manager guarantees the file is closed, even if reading fails.
with open(tweets_data_path, "r") as tweets_file:
    for line in tweets_file:
        try:
            tweets_data.append(json.loads(line))
        except ValueError:
            # json.loads raises ValueError (JSONDecodeError on Python 3) for
            # blank or malformed lines; skip those and keep going. Any other
            # exception is unexpected and is allowed to propagate.
            continue

# print len(tweets_data)

# return True if keyword is found in the text
def word_in_text(word, text):
    """Return True if ``word`` occurs anywhere in ``text``, ignoring case.

    The keyword is matched literally as a substring. Treating it as a
    regular expression made keywords containing metacharacters misbehave:
    "c++" raised ``re.error`` and "." matched any character.

    Args:
        word: keyword to look for.
        text: text to search in.

    Returns:
        bool: True when the lower-cased keyword is a substring of the
        lower-cased text, False otherwise.
    """
    return word.lower() in text.lower()

# print tweets_data[0].keys()
# write all tweets to a txt file (to keep as an intermediary step and for mining with other software)
texts = [tweet['text'] for tweet in tweets_data]
with codecs.open("architecture-tweets.txt", "w", encoding="utf-8-sig") as fileOut:
    for txt in texts:
        # Strip non-ASCII characters, then decode back to text: the codecs
        # writer expects text, and handing it bytes fails on Python 3.
        fileOut.write(txt.encode('ascii', 'ignore').decode('ascii'))
        # One tweet per line. Without a separator the last word of a tweet
        # fuses with the first word of the next one and skews the word count.
        fileOut.write("\n")

# Re-read the intermediary file and count whitespace-separated tokens.
with codecs.open(r"architecture-tweets.txt", "r", encoding="utf-8-sig") as fileIn:
    wordcount = Counter(fileIn.read().split())
for item in wordcount.items(): print("{}\t{}".format(*item))

# DATAFRAME
## create a dataframe
tweets = pd.DataFrame()

## add three columns to the dataframe
# Lists are built explicitly: on Python 3 ``map`` returns a lazy one-shot
# iterator, which is not a reliable value for a DataFrame column.
tweets['text'] = [tweet['text'] for tweet in tweets_data]
tweets['lang'] = [tweet['lang'] for tweet in tweets_data]
tweets['country'] = [
    tweet['place']['country'] if tweet['place'] is not None else None
    for tweet in tweets_data
]

## one boolean column per keyword: does the tweet text contain it?
# NOTE(review): word_in_text lower-cases both arguments, so the three
# spellings of "architecture" look like they yield identical columns - confirm
# whether a case-sensitive match was intended.
keywords = ('architecture','#architecture','Architecture','ARCHITECTURE')

for keyword in keywords:
    # ``kw=keyword`` pins the current keyword to this lambda.
    tweets[keyword] = tweets['text'].apply(lambda text, kw=keyword: word_in_text(kw, text))

## one boolean column per "relevant" term
relevant = ('computer','computational','computation','digital','3D','4D','CAD','computer-aided')

for term in relevant:
    tweets[term] = tweets['text'].apply(lambda text, kw=term: word_in_text(kw, text))

## a tweet is relevant when its text contains at least one of the terms
tweets['relevant'] = tweets['text'].apply(
    lambda text: any(word_in_text(term, text) for term in relevant)
)

def extract_link(text):
    """Return the first URL found in ``text``, or '' when there is none.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace, ``<``, ``>`` or ``"``.
    """
    url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    found = re.search(url_pattern, text)
    return found.group() if found else ''

def extract_txt(text):
    """Return the first URL-like substring of ``text``, or '' if none exists.

    NOTE(review): this uses the same pattern and returns the same value as
    extract_link; the name suggests it may have been meant to return the
    tweet text instead - confirm the intent before relying on it.
    """
    hits = re.finditer(r'https?://[^\s<>"]+|www\.[^\s<>"]+', text)
    # Only the leftmost match matters; None signals "no URL present".
    first = next(hits, None)
    if first is None:
        return ''
    return first.group()

# Store the first URL of every tweet (empty string when the tweet has none).
tweets['link'] = tweets['text'].apply(extract_link)
mine-tweets-py.txt · Last modified: 2017/01/12 14:06 by zoza