Notebook Fifteen |
Report |
Repository
Text Analysis
Andrea Leone
University of Trento
February 2022
import project
import os
import spacy
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import collections
import pickle
import grave
from tqdm.notebook import tqdm
# Fetch every talk that has a transcript, ordered by slug, through the project's SQL helper.
records = project.sql_query("""
SELECT * FROM talks
WHERE transcript IS NOT NULL
ORDER BY slug ASC;
""")
# Turn the fetched records into `df`; the cells below call `.iterrows()` on it and
# read the "transcript", "tags" and "year" fields of each row
# (presumably a pandas DataFrame — confirm in the `project` module).
df = project.create_dataframe_from(records)
Load the pipeline
# Load the spaCy "en_core_web_lg" pipeline; it is used below both to parse the
# transcripts and to turn single words into tokens for similarity scoring.
nlp = spacy.load("en_core_web_lg")
Process all transcripts
# Parsed transcripts are cached on disk because running the pipeline over every
# talk is slow; `docs` ends up holding one parsed document per row of `df`, in row order.
docs_pkl = "data/docs.v4.pkl"

if not os.path.exists(docs_pkl):
    # Create the cache folder *before* the long parsing pass, so the work cannot
    # be lost to a missing directory when the result is finally written.
    os.makedirs(os.path.dirname(docs_pkl) or ".", exist_ok=True)

    docs = list()
    # Give tqdm the row count explicitly instead of copying every row into a
    # list just so the progress bar knows its length.
    for _, record in tqdm(df.iterrows(), total=len(df)):
        docs.append(nlp(record["transcript"]))

    with open(docs_pkl, "wb") as file:
        pickle.dump(docs, file)
else:
    # NOTE: unpickling executes arbitrary code embedded in the file; only load
    # a cache that was produced locally by this notebook.
    with open(docs_pkl, "rb") as file:
        docs = pickle.load(file)
Semantic distribution
Let's start with "creativity" as the focus token
# Focus token: the first (and only) token of the parsed word "creativity".
tag = nlp("creativity")[0]

# Keep the (index, row) pairs of the talks whose "tags" field contains the focus word.
dt1 = [
    (index, talk)
    for index, talk in df.iterrows()
    if tag.text in talk["tags"]
]
Distribution of the tagged talks over the years
# Count the tagged talks per year, then split the (year, count) pairs into two
# parallel sequences with the project helper.
years, counts = project.unzip_array(
    collections.Counter(talk["year"] for _, talk in dt1).items()
)

# Figure size is given in inches: 45 x 20 divided by 2.54.
plt.figure(figsize=(45 / 2.54, 20 / 2.54))
plt.stem(years, counts)
plt.show()
Define the dataset, split into two decades: 2002-2012 (talks from 2003 through 2012) and 2012-2022 (talks from 2013 through 2022)
# Partition the tagged talks by year; both bounds of each range are exclusive.
ds1 = {
    "02-12": [
        (index, talk) for index, talk in dt1
        if 2002 < talk["year"] < 2013
    ],
    "12-22": [
        (index, talk) for index, talk in dt1
        if 2012 < talk["year"] < 2023
    ],
}
Define the logic to analyse the texts in the dataset: collect the lemmas and, for each one, its similarity score with the focus tag
def analyze(tag, rows, task="docs", pos="NOUN", to_df=False, norm_c=False, split_value=0.5, corpus=None):
    """Summarise either the vocabulary or the tags of a group of talks.

    Parameters:
        tag: focus token; only ``tag.similarity(token)`` is used, and only by
            the ``"docs"`` task.
        rows: iterable of ``(index, row)`` pairs. ``index`` selects the parsed
            document, ``row["tags"]`` is read by the ``"tags"`` task.
        task: ``"docs"`` to rank lemmas by similarity to ``tag``, ``"tags"``
            to count tag occurrences.
        pos: coarse part-of-speech label a token must carry to be counted.
        to_df: return a ``pandas.DataFrame`` instead of a plain list.
        norm_c: append a ``norm-wc`` column holding each count as a per-mille
            share of the summed counts of the returned lemmas.
        split_value: fraction of the distinct lemmas (most frequent first)
            that is kept before ranking by similarity.
        corpus: sequence of parsed documents indexed by ``index``; defaults
            to the module-level ``docs``.

    Returns:
        ``"docs"``: ``[word, tag-similarity, word-count(, norm-wc)]`` entries
        sorted by decreasing similarity. ``"tags"``: ``(tag, tag-count)``
        pairs sorted by decreasing count. Either one as a DataFrame when
        ``to_df`` is true.

    Raises:
        ValueError: if ``task`` is neither ``"docs"`` nor ``"tags"``.
    """
    if task == "docs":
        parsed = docs if corpus is None else corpus

        occurrences = list()
        similarities = dict()
        for index, _ in rows:
            for token in parsed[index]:
                # An all-zero vector means there is no usable embedding for the
                # token; testing it this way works for any vector width.
                if token.pos_ == pos and np.any(token.vector):
                    occurrences.append(token.lemma_)
                    # Several surface forms can share one lemma: the score of
                    # the last occurrence seen is the one that is kept.
                    similarities[token.lemma_] = tag.similarity(token)

        ranked = [
            [lemma, similarities[lemma], count]
            for lemma, count in collections.Counter(occurrences).most_common()
        ]

        # Keep only the most frequent share of the lemmas, then order by similarity.
        cutoff = int(len(ranked) * split_value)
        results = sorted(ranked[:cutoff], key=lambda entry: entry[1], reverse=True)

        columns = ["word", "tag-similarity", "word-count"]
        if norm_c:
            total = sum(count for _, _, count in results)
            columns.append("norm-wc")
            results = [
                [lemma, score, count, (count / total) * 1000]
                for lemma, score, count in results
            ]

        if to_df:
            return pd.DataFrame(results, columns=columns)
        return results

    if task == "tags":
        # `label` (not `tag`) so the focus-token parameter is not shadowed.
        results = collections.Counter(
            label for _, row in rows for label in row["tags"]
        ).most_common()

        if to_df:
            return pd.DataFrame(results, columns=["tag", "tag-count"])
        return results

    raise ValueError("unknown task {!r}: expected 'docs' or 'tags'".format(task))
def sort_values(rows, st=0.5):
    """Return the entries of `rows` whose score (item 1) exceeds `st`.

    The result is ordered by decreasing score; entries with equal scores are
    ordered by increasing count (item 2), because both sorts are stable.
    """
    ordered = list(rows)
    ordered.sort(key=lambda entry: entry[2])
    kept = [entry for entry in ordered if entry[1] > st]
    kept.sort(key=lambda entry: entry[1], reverse=True)
    return kept
The analysis extracts a particular part-of-speech role; in this case, all nouns (suggested alternative: verbs)
# Per decade: keep the 30 nouns most similar to the focus tag and preview the first ten.
dr1 = {}
for key in ds1:
    dr1[key] = analyze(tag, ds1[key], pos="NOUN", to_df=True, norm_c=True).head(30)
    print("\nyear={}".format(key))
    print(dr1[key].head(10))
year=02-12
word tag-similarity word-count norm-wc
0 creativity 1.000000 71 2.151906
1 imagination 0.759219 24 0.727405
2 inspiration 0.702141 19 0.575862
3 innovation 0.664305 28 0.848639
4 passion 0.655068 22 0.666788
5 motivation 0.619925 9 0.272777
6 storytelling 0.590511 22 0.666788
7 curiosity 0.589669 6 0.181851
8 talent 0.589180 17 0.515245
9 brilliance 0.585092 2 0.060617
year=12-22
word tag-similarity word-count norm-wc
0 creativity 1.000000 143 1.939325
1 imagination 0.759219 53 0.718771
2 inspiration 0.702141 29 0.393290
3 ingenuity 0.693314 12 0.162741
4 innovation 0.664305 90 1.220554
5 passion 0.655068 38 0.515345
6 artistry 0.650408 4 0.054247
7 individuality 0.644738 7 0.094932
8 enthusiasm 0.641415 4 0.054247
9 storytelling 0.590511 31 0.420413
# Gather the "word" column of every decade's result table into one flat list;
# a word present in both tables is therefore listed twice.
wls = list()
# NOTE(review): `wld` is not referenced anywhere else in the visible code.
wld = dict()
for dr in dr1.values():
wls.extend([row["word"] for i,row in dr.iterrows()])
# NOTE(review): indentation was lost in this export, so it is unclear whether the
# reversal runs once after the loop or on every iteration — confirm against the notebook.
wls.reverse()
def plot_stem_difference(dr, wls, cs, mv=100):
    """Draw one horizontal stem series per entry of `dr` on a shared figure.

    Parameters:
        dr: mapping from a series key to a DataFrame with "word" and
            "norm-wc" columns.
        wls: words to place on the categorical axis, in plotting order.
        cs: mapping from the same keys to the marker colour of each series.
        mv: upper cap applied to every plotted value.

    A word missing from a series' table is plotted at 0. The function sets
    the global matplotlib font family to 'Graphik' and shows the figure.
    """
    plt.rcParams['font.family'] = 'Graphik'
    fig, ax = plt.subplots(figsize=(10, 10), dpi=180)

    for key in dr.keys():
        # Read from the `dr` argument: the previous body looked the table up in
        # the global `dr1`, so the function ignored whatever the caller passed.
        frame = dr[key]

        wll = list()
        for wl in wls:
            nwc = frame[frame["word"] == wl]["norm-wc"]
            if len(nwc) > 0:
                val = nwc.iloc[0]
                wll.append(val if val < mv else mv)
            else:
                wll.append(0)

        markerline, stemlines, baseline = plt.stem(
            wls, wll, markerfmt=cs[key], orientation="horizontal"
        )
        # Fade stems and baseline so only the coloured markers stand out.
        plt.setp(stemlines, linewidth=1, alpha=.5, color="lightgray")
        plt.setp(baseline, linewidth=1, alpha=.1, color="lightgray")
        plt.setp(markerline, linewidth=0, alpha=1, color=cs[key], marker="o")

    plt.show()
# Plot both decades on one chart, one colour per decade; normalised counts are capped at 3 (mv=3).
plot_stem_difference(dr1, wls, cs={ "02-12": "deepskyblue", "12-22": "crimson" }, mv=3)
Use the sorted results of the analysis to create a graph for each decade
# Per decade: the nouns whose similarity to the focus tag exceeds 0.5, most similar first.
G = {}
gvs = {
    key: sort_values(analyze(tag, ds1[key], pos="NOUN"), st=0.5)
    for key in ds1
}

for key in ds1:
    G[key] = nx.Graph()

    # Connect the focus tag to every ranked lemma except the first entry,
    # weighting each edge with the lemma's similarity score.
    G[key].add_weighted_edges_from([
        (tag.text, lemma, w)
        for lemma, w, _ in gvs[key][1:]
    ])

    # Attach similarity and count to every ranked lemma (first entry included).
    for lemma, s, c in gvs[key]:
        G[key].add_node(lemma, similarity=s, count=c)

    # Re-parse each lemma on its own and link the pairs whose mutual
    # similarity lies strictly between 0.62 and 1.
    lemmas = [nlp(lemma)[0] for lemma, _, _ in gvs[key]]
    for l1 in lemmas:
        for l2 in lemmas:
            w = l2.similarity(l1)
            if w > 0.62 and w < 1:
                G[key].add_edge(l1.text, l2.text, weight=w)

    fig, ax = plt.subplots(figsize=(8, 8), dpi=180)
    pos = nx.spring_layout(G[key], seed=5)
    iet = len(G[key].edges()) - len(gvs[key])

    # First pass: the leading len(gvs[key]) - 1 edges of the graph's edge list, drawn very faintly.
    nx.draw(
        G[key], pos,
        node_size=0,
        edge_color=(0, 0, 0, .02),
        edgelist=list(G[key].edges())[:len(gvs[key]) - 1],
    )
    nx.draw_networkx_nodes(
        G[key], pos,
        node_size=2,
        node_color="darkgray",
        node_shape="o",
        alpha=.2,
        nodelist=list(G[key].nodes()),
    )
    # Second pass: the trailing `iet` edges of the edge list, drawn slightly darker.
    nx.draw_networkx_edges(
        G[key], pos,
        edge_color=(0, 0, 0, .04),
        edgelist=list(G[key].edges())[-iet:],
    )
    nx.draw_networkx_labels(
        G[key], pos,
        font_color="black",
        font_size=6,
        font_family="Graphik",
        alpha=.8,
        horizontalalignment="center",
        verticalalignment="bottom",
    )

    fig.savefig("images/network.{}.{}.svg".format(tag.text, key), format="svg", dpi=1200)
    plt.show()
Semantic tag distribution
Aggregate the relationship among tags
# For each decade, collect the tags of its talks and keep those whose
# similarity to the focus tag exceeds 0.425 as [focus, tag, similarity] entries.
wts = dict(); gvs = dict()
for key in ds1.keys():
    wts[key] = list()
    for _, row in ds1[key]:
        wts[key].extend(row["tags"])

    gvs[key] = list()
    # Iterate the distinct tags in sorted order: the iteration order of a set
    # of strings changes between interpreter runs, which would otherwise make
    # the edge order (and the seeded graph layout built from it) unrepeatable.
    for word in sorted(set(wts[key])):
        if word == tag.text:
            continue
        parsed = nlp(word)
        if len(parsed) == 0:
            # An empty or whitespace-only tag yields no token to compare.
            continue
        # Only the first token of a multi-word tag is compared.
        m = parsed[0]
        # Skip tags without a usable embedding (all-zero vector, any width).
        if not np.any(m.vector):
            continue
        w = tag.similarity(m)
        if w > 0.425:
            gvs[key].append([tag.text, word, w])
Like before, create a graph from the results for both decades
G = {}
for key in ds1:
    G[key] = nx.Graph()

    # Connect the focus tag to every retained tag, weighted by their similarity.
    G[key].add_weighted_edges_from([
        (tag.text, word, w)
        for _, word, w in gvs[key]
    ])

    # Link pairs of distinct tags whose mutual similarity lies strictly between 0.5 and 1.
    lemmas = [nlp(lemma)[0] for _, lemma, _ in gvs[key]]
    for l1 in lemmas:
        for l2 in lemmas:
            if l1.text == l2.text:
                continue
            w = l2.similarity(l1)
            if w > 0.5 and w < 1:
                G[key].add_edge(l1.text, l2.text, weight=w)

    fig, ax = plt.subplots(figsize=(8, 8), dpi=180)
    pos = nx.spring_layout(G[key], seed=5)

    # First pass: the leading len(gvs[key]) - 1 edges of the graph's edge list, drawn very faintly.
    nx.draw(
        G[key], pos,
        node_size=0,
        edge_color=(0, 0, 0, .02),
        edgelist=list(G[key].edges())[:len(gvs[key]) - 1],
    )
    nx.draw_networkx_nodes(
        G[key], pos,
        node_size=2,
        node_color="darkgray",
        node_shape="o",
        alpha=.2,
        nodelist=list(G[key].nodes()),
    )
    # Second pass: the trailing len(gvs[key]) edges of the edge list, drawn slightly darker.
    nx.draw_networkx_edges(
        G[key], pos,
        edge_color=(0, 0, 0, .04),
        edgelist=list(G[key].edges())[-len(gvs[key]):],
    )
    nx.draw_networkx_labels(
        G[key], pos,
        font_color="black",
        font_size=6,
        font_family="Graphik",
        alpha=.8,
        horizontalalignment="center",
        verticalalignment="bottom",
    )

    plt.show()
# Tally every lemma across all parsed transcripts.
word_counts = collections.Counter()
for doc in docs:
    word_counts.update(token.lemma_ for token in doc)

# Split the (lemma, count) pairs, most frequent first, into two parallel sequences.
X, Y = project.unzip_array(word_counts.most_common())

# Stem chart of the 100 most frequent lemmas.
fig, ax = plt.subplots(figsize=(10, 10), dpi=180)
markerline, stemlines, baseline = plt.stem(X[:100], Y[:100])
plt.setp(stemlines, linewidth=2, alpha=.5, color="limegreen")
plt.setp(baseline, linewidth=2, alpha=.1, color="limegreen")
# Markers are hidden (size 0), leaving only the stems visible.
plt.setp(markerline, linewidth=0, alpha=1, color="limegreen", marker="o", markersize=0)
# Hide the x axis, and with it the 100 lemma labels.
plt.gca().get_xaxis().set_visible(False)
plt.show()