====== Mining Tweets ======

The script below reads streamed tweets from a line-delimited JSON file (''tweets.json''), writes the tweet texts to a plain-text file and counts word frequencies, then builds a pandas DataFrame that flags tweets mentioning architecture and a set of computation-related terms, and extracts any links they contain.

<code python>
import re
import json
import codecs
from collections import Counter

import pandas as pd

# read the data into a list (one JSON object per line)
tweets_data_path = 'tweets.json'

tweets_data = []
with open(tweets_data_path, "r") as tweets_file:
    for line in tweets_file:
        try:
            tweets_data.append(json.loads(line))
        except ValueError:
            # skip blank or malformed lines
            continue

# print(len(tweets_data))

# return True if the keyword occurs in the text (case-insensitive)
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(re.escape(word), text)
    if match:
        return True
    return False

# print(tweets_data[0].keys())

# write all tweet texts to a txt file
# (kept as an intermediate step and for mining with other software)
texts = [tweet.get('text', '') for tweet in tweets_data]
with codecs.open("architecture-tweets.txt", "w", encoding="utf-8-sig") as file_out:
    for txt in texts:
        file_out.write(txt + "\n")

# count word frequencies in the text dump
with codecs.open("architecture-tweets.txt", "r", encoding="utf-8-sig") as file_in:
    wordcount = Counter(file_in.read().split())

for word, count in wordcount.most_common():
    print("{}\t{}".format(word, count))

# DATAFRAME
## create a dataframe with one row per tweet
tweets = pd.DataFrame()

## add three columns to the dataframe
tweets['text'] = [tweet.get('text', '') for tweet in tweets_data]
tweets['lang'] = [tweet.get('lang') for tweet in tweets_data]
tweets['country'] = [tweet['place'].get('country') if tweet.get('place') else None
                     for tweet in tweets_data]

# flag the search keywords; word_in_text() lowercases both sides,
# so the capitalised variants produce the same values as 'architecture'
keywords = ('architecture', '#architecture', 'Architecture', 'ARCHITECTURE')
for keyword in keywords:
    tweets[keyword] = tweets['text'].apply(lambda text, kw=keyword: word_in_text(kw, text))

# flag terms that mark a tweet as relevant to computational design
relevant = ('computer', 'computational', 'computation', 'digital',
            '3D', '4D', 'CAD', 'computer-aided')
for term in relevant:
    tweets[term] = tweets['text'].apply(lambda text, t=term: word_in_text(t, text))

# a tweet is relevant if it contains any of the terms above
tweets['relevant'] = tweets['text'].apply(
    lambda text: any(word_in_text(term, text) for term in relevant))

# return the first link found in the text, or '' if there is none
def extract_link(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match = re.search(regex, text)
    if match:
        return match.group()
    return ''

# return the text with any links stripped out
def extract_txt(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    return re.sub(regex, '', text).strip()

tweets['link'] = tweets['text'].apply(extract_link)
</code>
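
A few standard pandas calls give a quick overview of what was collected. This is a minimal sketch, assuming the script above has already run and populated ''tweets''; it only uses the columns created there.

<code python>
# quick overview of the collected tweets (run after the script above)

# how many tweets per language
print(tweets['lang'].value_counts())

# how many tweets matched each search keyword
print(tweets[list(keywords)].sum())

# how many tweets are relevant to computational design
print(tweets['relevant'].sum(), "of", len(tweets), "tweets are relevant")

# show the text and link of the first relevant tweets
print(tweets[tweets['relevant']][['text', 'link']].head(20))
</code>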
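
The plain-text dump above is kept "for mining with other software"; the DataFrame can be exported the same way. A small sketch under the same assumption that the script has run; the CSV file names are only examples.

<code python>
# export the DataFrame (or just the relevant rows) for use in other tools
tweets.to_csv("architecture-tweets-all.csv", index=False, encoding="utf-8-sig")
tweets[tweets['relevant']].to_csv("architecture-tweets-relevant.csv", index=False, encoding="utf-8-sig")
</code>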