WhatsApp group chat analysis

Victor Ibarra
MCD-UNISON
Published in
7 min readDec 5, 2020

--

Based on a project done by kurasaiteja, you can review his full repository in here: github.com/kurasaiteja/Whatsapp-Analysis/blob/master/Whatsapp_Group_Chat_Analysis.ipynb

Photo by Christian Wiediger on Unsplash

First of all, its important to show you the libraries used for this analysis.

import os 
import re
import emoji
import plotly.express as px
import regex
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import PIL
import datetime
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline

Then we make some definitions.

def startsWithDateAndTime(s):
pattern = '^\d{1,2}\/\d{1,2}\/\d{2} \d{1,2}\:\d{1,2} - '
result = re.match(pattern, s)
if result:
return True
return False
def findAuthor(s):
patterns = [
'([\w]+):', # First Name
'([\w]+[\s]+[\w]+):', # First Name + Last Name
'([\w]+)[\u263a-\U0001f999]+:', # Name and Emoji
'[\w]+ ?[^\s\u1f300-\u1f5ff]:', # Name and Emoji
]
pattern = '^' + '|'.join(patterns)
result = re.match(pattern, s)
if result:
return True
return False
def getDataPoint(line):
splitLine = line.split(' - ')
dateTime = splitLine[0]
date, time = dateTime.split(' ')
message = ' '.join(splitLine[1:])
if findAuthor(message):
splitMessage = message.split(': ')
author = splitMessage[0]
message = ' '.join(splitMessage[1:])
else:
author = None
return date, time, author, message
def split_count(text):
emoji_list = []
data = regex.findall('\W', text)
for word in data:
if any(char in emoji.UNICODE_EMOJI for char in word):
emoji_list.append(word)
return emoji_list

Read the file.

parsedData = [] # List to keep track of data so it can be used by a Pandas dataframeconversationPath = 'SpiceBoys.txt'print(conversationPath)
SpiceBoys.txt
with open(conversationPath, encoding="utf-8") as fp:
messageBuffer = []
date, time, author = None, None, None
while True:
line = fp.readline()
print(line)
if not line:
break
line = line.strip()
if startsWithDateAndTime(line):
if len(messageBuffer) > 0:
parsedData.append([date, time, author, ' '.join(messageBuffer)])
messageBuffer.clear()
date, time, author, message = getDataPoint(line)
messageBuffer.append(message)
else:
messageBuffer.append(line)

Building database

df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message']) # Initialising a pandas Dataframe.
df["Date"] = pd.to_datetime(df["Date"])
df["Message"] = df["Message"].str.lower()

df.head();
df = df.dropna()

I might protect my friend’s privacy, so:

author_list = df.Author.unique()
nicknames = ['Fercho', 'Cheve', 'Monchis', 'Toro', 'Colo', 'Weto','Chewbacca','Cuy', 'Enano' ]
df['Author'].replace(author_list, nicknames, inplace=True)

df.head()
total_messages = df.shape[0]
print('Mensajes totales: ',total_messages)
media_messages = df[df['Message'] == '<Media omitted>'].shape[0]
print('Mensajes multimedia: ', media_messages)
Mensajes totales: 6200
Mensajes multimedia: 0

We can make some other stats:

Emoji identification:

df["emoji"] = df["Message"].apply(split_count)
#print(df["emoji"])

Links identification:

URLPATTERN = r'(https?://\S+)'
df['urlcount'] = df.Message.apply(lambda x: re.findall(URLPATTERN, x)).str.len()
links = np.sum(df.urlcount)
links
45

Our DataFrame will look like this:

media_messages_df = df[df['Message'] == '<Media omitted>']
messages_df = df.drop(media_messages_df.index)
messages_df['Letter_Count'] = messages_df['Message'].apply(lambda s : len(s))
messages_df['Word_Count'] = messages_df['Message'].apply(lambda s : len(s.split(' ')))

messages_df

Now we are about to identify days of the week:

def f(i):
l = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
return l[i];
day_df=pd.DataFrame(messages_df["Message"])
day_df['day_of_date'] = messages_df['Date'].dt.weekday
day_df['day_of_date'] = day_df["day_of_date"].apply(f)
day_df["messagecount"] = 1
day = day_df.groupby("day_of_date").sum()
day.reset_index(inplace=True)

Author stats:

l = messages_df.Author.unique()

for i in range(len(l)):
# Filtering out messages of particular user
req_df= messages_df[messages_df["Author"] == l[i]]
# req_df will contain messages of only one particular user
print(f'Stats of {l[i]} -')
# shape will print number of rows which indirectly means the number of messages
print('Messages Sent', req_df.shape[0])
#Word_Count contains of total words in one message. Sum of all words/ Total Messages will yield words per message
words_per_message = (np.sum(req_df['Word_Count']))/req_df.shape[0]
print('Words per message', words_per_message)
#media conists of media messages
media = messages_df[messages_df['Author'] == l[i]].shape[0]
print('Media Messages Sent', media)
# emojis conists of total emojis
emojis = sum(req_df['emoji'].str.len())
print('Emojis Sent', emojis)
#links consist of total links
links = sum(req_df["urlcount"])
print('Links Sent', links)
print()
Stats of Fercho -
Messages Sent 1805
Words per message 3.329085872576177
Media Messages Sent 1805
Emojis Sent 164
Links Sent 13

Stats of Cheve -
Messages Sent 938
Words per message 3.0309168443496803
Media Messages Sent 938
Emojis Sent 77
Links Sent 8

Stats of Monchis -
Messages Sent 471
Words per message 3.9660297239915074
Media Messages Sent 471
Emojis Sent 10
Links Sent 7

Stats of Toro -
Messages Sent 749
Words per message 3.6328437917222964
Media Messages Sent 749
Emojis Sent 27
Links Sent 3

Stats of Colo -
Messages Sent 420
Words per message 3.0404761904761903
Media Messages Sent 420
Emojis Sent 93
Links Sent 4

Stats of Weto -
Messages Sent 710
Words per message 3.5619718309859154
Media Messages Sent 710
Emojis Sent 99
Links Sent 3

Stats of Chewbacca -
Messages Sent 566
Words per message 3.3321554770318023
Media Messages Sent 566
Emojis Sent 55
Links Sent 3

Stats of Cuy -
Messages Sent 112
Words per message 3.2767857142857144
Media Messages Sent 112
Emojis Sent 13
Links Sent 0

Stats of Enano -
Messages Sent 429
Words per message 2.86013986013986
Media Messages Sent 429
Emojis Sent 214
Links Sent 4

Counting words:

text = " ".join(review for review in messages_df.Message)
print ("There are {} words in all the messages.".format(len(text)))

There are 130605 words in all the messages.

Stopwords and WordCloud:

stopwords = set(STOPWORDS)
# REGREX = '[ja]?(ja)+[ja]?'
stopwords.update(["de", "la", "el", "lo", "la", "de", "que","del","si","esa","ese","al"
"es","en","pero","los","un","con","le", "porque", "como", "una"
, "para", "es","al", "las", "Multimedia", "omitido", "jajaja", "les",
"todo", "ya", "jaja", "jajajaja", "te", "su", "ya", "jajajajaja", "se","yo","eso",
"cuando", "va", "ni", "son","pues","aqui","hoy","neta","por","simon","voy","ahi","asi",
"tu", "nos", "ok","mi","ok","jajajajajaja","era", "che", "ir", "más","vdd", "muy", "ala",
"jajajaj","hay","Jajaja","alguien","así","esta","eres","gracias", "tambien",
"aqui","está","hace","tiene","vez","nomas","ando","otro","https","estoy"
])
wordcloud = WordCloud(stopwords=stopwords,background_color="navy",colormap='rainbow' ).generate(text)

plt.figure( figsize=(10,5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

Emojis counting:

total_emojis_list = list(set([a for b in messages_df.emoji for a in b]))
total_emojis = len(total_emojis_list)
print(total_emojis)
89total_emojis_list = list([a for b in messages_df.emoji for a in b])


emoji_dict = dict(Counter(total_emojis_list))

emoji_dict = sorted(emoji_dict.items(), key=lambda x: x[1], reverse=True)

print(emoji_dict)
[('🤣', 197), ('❤', 38), ('😂', 38), ('🏻', 34), ('🤔', 29), ('🏼', 29), ('😆', 27), ('👍', 26), ('💕', 17), ('👌', 17), ('👀', 16), ('💔', 15), ('😘', 13), ('🤤', 13), ('♂', 12), ('🤙', 12), ('🌚', 12), ('😱', 10), ('\U0001f974', 10), ('😬', 9), ('\U0001f92e', 8), ('😯', 8), ('🤷', 7), ('🍻', 7), ('😎', 7), ('💙', 6), ('🏿', 5), ('🤢', 5), ('🙌', 5), ('🤓', 5), ('✌', 4), ('😨', 4), ('\U0001f970', 4), ('\U0001f92d', 4), ('😞', 4), ('😍', 4), ('😅', 4), ('🙄', 3), ('☹', 3), ('\U0001f92a', 3), ('😉', 3), ('😔', 3), ('\U0001f928', 3), ('😁', 3), ('✨', 3), ('💖', 3), ('🤦', 3), ('☠', 2), ('😑', 2), ('📝', 2), ('😭', 2), ('😪', 2), ('👴', 2), ('😢', 2), ('💪', 2), ('🤗', 2), ('😏', 2), ('🕺', 2), ('👨', 2), ('🐧', 2), ('👏', 2), ('🐶', 2), ('🐙', 1), ('🙋', 1), ('🎤', 1), ('🎼', 1), ('🙆', 1), ('💅', 1), ('😊', 1), ('🙁', 1), ('🥂', 1), ('😖', 1), ('\U0001f973', 1), ('😰', 1), ('😒', 1), ('🤖', 1), ('✋', 1), ('\U0001f9a0', 1), ('\U0001f97a', 1), ('💩', 1), ('🐱', 1), ('\U0001f9b3', 1), ('😸', 1), ('🖖', 1), ('🏾', 1), ('🥗', 1), ('😕', 1), ('😓', 1), ('😛', 1)]
emoji_df = pd.DataFrame(emoji_dict, columns=['emoji', 'count'])
emoji_df

Emoji’s plotting:

fig = px.pie(emoji_df, values ='count', names = 'emoji')fig.update_traces(textposition = 'inside', textinfo = 'percent+label')fig.show()

You can try this lines, to see what your buddie’s favorite emoji:

l = messages_df.Author.unique()for i in range(len(l)):dummy_df = messages_df[messages_df['Author'] == l[i]]total_emojis_list = list([a for b in dummy_df.emoji for a in b])emoji_dict = dict(Counter(total_emojis_list))emoji_dict = sorted(emoji_dict.items(), key=lambda x: x[1], reverse=True)print('Emoji Distribution for', l[i])author_emoji_df = pd.DataFrame(emoji_dict, columns=['emoji', 'count'])fig = px.pie(author_emoji_df, values='count', names='emoji')fig.update_traces(textposition='inside', textinfo='percent+label')fig.show()fig.show()

Bussiest days? Really?

date_df = messages_df.groupby("Date").sum()date_df.reset_index(inplace=True)fig = px.line(date_df, x = "Date", y = "Word_Count")fig.update_xaxes(nticks=20)fig.show()

Dates with higher number of messages:

messages_df['Date'].value_counts().head(10).plot.barh()plt.xlabel('Number of Messages')plt.ylabel('Date')
fig = px.line_polar(day, r=’messagecount’, theta=’day_of_date’, line_close=True)fig.update_traces(fill=’toself’)fig.update_layout(polar=dict(radialaxis=dict(visible=True,range=[0,3000] # adjust this for better resolution depending on how much the group is used)),showlegend=False)fig.show()

What time is the bussiest?

messages_df['Time'].value_counts().head(10).plot.barh()plt.xlabel('Number of messages')plt.ylabel('Time')

--

--