# User Tools
# Site Tools

# Mining Tweets

import re, json, codecs
import pandas as pd
from collections import Counter, OrderedDict

# Read the streamed tweets (one JSON object per line) into a list of dicts.
tweets_data_path = 'tweets.json'

tweets_data = []
# NOTE(review): the loop body was missing in the original (syntax error) —
# per-line json.loads is the standard way to read a Twitter stream dump.
with open(tweets_data_path, "r") as tweets_file:
    for line in tweets_file:
        line = line.strip()
        if not line:
            continue  # skip keep-alive blank lines in the stream dump
        try:
            tweets_data.append(json.loads(line))
        except ValueError:
            # Malformed line (truncated/partial write) — skip, don't abort.
            continue

# print(len(tweets_data))

# return True if keyword is found in the text
def word_in_text(word, text):
    """Case-insensitively report whether *word* occurs anywhere in *text*.

    The original line lost its ``re.search`` call in extraction; restored
    here with ``re.escape`` because several keywords used below contain
    regex metacharacters (e.g. ``#architecture``, ``computer-aided``).
    """
    word = word.lower()
    text = text.lower()
    match = re.search(re.escape(word), text)
    return match is not None

# print tweets_data[0].keys()
# write all tweets to a txt file (to keep as an intermediary step and for mining with other software)
fileOut ="architecture-tweets.txt", "w", encoding="utf-8-sig")
texts = [tweet['text'] for tweet in tweets_data]
for txt in texts:

fileIn ="architecture-tweets.txt", "r", encoding="utf-8-sig")

wordcount = Counter(
for item in wordcount.items(): print("{}\t{}".format(*item))

## create a dataframe
tweets = pd.DataFrame()

## add three columns to the dataframe
# BUG FIX: in Python 3, map() returns a lazy iterator; assigning it to a
# DataFrame column does not expand into per-row values. Use list
# comprehensions instead. .get('place') also guards tweets without the key.
tweets['text'] = [tweet['text'] for tweet in tweets_data]
tweets['lang'] = [tweet['lang'] for tweet in tweets_data]
tweets['country'] = [
    tweet['place']['country'] if tweet.get('place') is not None else None
    for tweet in tweets_data
]

keywords = ('architecture', '#architecture', 'Architecture', 'ARCHITECTURE')

# One boolean column per keyword variant (same columns as the original
# copy-pasted lines). kw is bound as a default argument so each lambda
# captures its own keyword rather than the loop's final value.
for kw in keywords:
    tweets[kw] = tweets['text'].apply(lambda tweet, kw=kw: word_in_text(kw, tweet))

relevant = ('computer', 'computational', 'computation', 'digital', '3D', '4D', 'CAD', 'computer-aided')

# One boolean column per computation-related term (replaces eight
# copy-pasted lines). term is default-bound to avoid late-binding closures.
for term in relevant:
    tweets[term] = tweets['text'].apply(lambda tweet, term=term: word_in_text(term, tweet))

# True when ANY of the terms appears in the tweet text (replaces the
# hand-written eight-way `or` chain).
tweets['relevant'] = tweets['text'].apply(
    lambda tweet: any(word_in_text(term, tweet) for term in relevant)
)

def extract_link(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match =, text)
    if match:
    return ''

def extract_txt(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match =, text)
    if match:
    return ''

# Extract the first link (if any) from each tweet text into its own column;
# extract_link is already unary, so it can be passed to apply directly.
tweets['link'] = tweets['text'].apply(extract_link)
# mine-tweets-py.txt · Last modified: 2017/01/12 14:06 by zoza