# NOTE: stray PHP warning output (pasted from a wiki server log) — commented
# out so this Python script remains runnable; kept verbatim below:
# Warning: Use of undefined constant PREG_PATTERN_VALID_LANGUAGE - assumed 'PREG_PATTERN_VALID_LANGUAGE' (this will throw an Error in a future version of PHP) in
# /var/www/kucjica/emperors-wiki/inc/parser/xhtml.php on line
# 633
# Warning: preg_replace(): Delimiter must not be alphanumeric or backslash in
# /var/www/kucjica/emperors-wiki/inc/parser/xhtml.php on line
# 633
import re, json, codecs
import pandas as pd
from collections import Counter, OrderedDict
# Read the raw tweet capture (one JSON object per line) into a list,
# skipping lines that are not valid JSON (truncated/partial writes).
tweets_data_path = 'tweets.json'
tweets_data = []
# Use a context manager so the file is closed even if loading fails.
with open(tweets_data_path, "r") as tweets_file:
    for line in tweets_file:
        try:
            tweets_data.append(json.loads(line))
        except json.JSONDecodeError:
            # Malformed line in the capture file: best-effort skip,
            # but no longer swallow unrelated exceptions (bare except).
            continue
# print(len(tweets_data))
# return True if keyword is found in the text
def word_in_text(word, text):
    """Return True if *word* occurs in *text*, case-insensitively.

    The keyword is regex-escaped so metacharacters (e.g. '+', '.', '(')
    are matched literally instead of being interpreted as regex syntax —
    the original passed the raw keyword straight to re.search().
    """
    word = re.escape(word.lower())
    text = text.lower()
    return re.search(word, text) is not None
# print(tweets_data[0].keys())
# Write all tweet texts to a txt file (kept as an intermediary step and
# for mining with other software), then count word frequencies from it.
texts = [tweet['text'] for tweet in tweets_data]
with codecs.open("architecture-tweets.txt", "w", encoding="utf-8-sig") as fileOut:
    for txt in texts:
        # BUG FIX: the original wrote bytes (.encode) to a codecs TEXT
        # writer, which raises TypeError on Python 3.  Keep the
        # ascii-ignore filtering but decode back to str before writing.
        fileOut.write(txt.encode('ascii', 'ignore').decode('ascii'))
with codecs.open(r"architecture-tweets.txt", "r", encoding="utf-8-sig") as fileIn:
    # fileIn was never closed in the original; the with-block fixes that.
    wordcount = Counter(fileIn.read().split())
for word, count in wordcount.items():
    print("{}\t{}".format(word, count))
# DATAFRAME
## create a dataframe with one row per tweet.
tweets = pd.DataFrame()
## add three columns to the dataframe.
## BUG FIX: on Python 3, map() returns a lazy iterator; assigning it as a
## DataFrame column yields empty/NaN data.  Use list comprehensions.
tweets['text'] = [tweet['text'] for tweet in tweets_data]
tweets['lang'] = [tweet['lang'] for tweet in tweets_data]
tweets['country'] = [tweet['place']['country'] if tweet['place'] is not None else None
                     for tweet in tweets_data]
## One boolean column per spelling variant of the main keyword
## (replaces four copy-pasted apply() lines).
keywords = ('architecture', '#architecture', 'Architecture', 'ARCHITECTURE')
for kw in keywords:
    # kw=kw binds the loop variable eagerly inside the lambda.
    tweets[kw] = tweets['text'].apply(lambda tweet, kw=kw: word_in_text(kw, tweet))
# One boolean column per tech-related term; a tweet is 'relevant' when it
# mentions any of them.  Replaces eight copy-pasted apply() lines and a
# seven-way `or` chain with a loop and any().
relevant = ('computer', 'computational', 'computation', 'digital',
            '3D', '4D', 'CAD', 'computer-aided')
for term in relevant:
    # term=term binds the loop variable eagerly inside the lambda.
    tweets[term] = tweets['text'].apply(lambda tweet, term=term: word_in_text(term, tweet))
tweets['relevant'] = tweets['text'].apply(
    lambda tweet: any(word_in_text(term, tweet) for term in relevant))
def extract_link(text):
    """Return the first URL found in *text*, or '' when none is present.

    Matches both scheme-prefixed links (http/https) and bare www. hosts.
    """
    url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    found = re.search(url_pattern, text)
    return found.group() if found else ''
def extract_txt(text):
    """Return the first URL found in *text*, or '' when none is present.

    NOTE(review): this is behaviorally identical to extract_link despite
    its name — presumably it was meant to return the tweet text with the
    link stripped out; confirm the intent before changing it.
    """
    pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    hit = re.search(pattern, text)
    if hit is None:
        return ''
    return hit.group()
# Per-tweet extracted URL ('' when the tweet contains no link).
tweets['link'] = tweets['text'].apply(extract_link)