# Import texthero ...
import texthero as hero
import pandas as pd
# ... load any text dataset with Pandas
# The CSV is fetched over HTTP from the texthero repository; the example
# output shows it has a "text" column and a "topic" column.
df = pd.read_csv(
    "https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv"
)
# Preview the first two rows (displayed when run in a REPL/notebook).
df.head(2)
# Example output:
#                                                 text      topic
# 0  Claxton hunting first major medal\n\nBritish h...  athletics
# 1  O'Sullivan could run in Worlds\n\nSonia O'Sull...  athletics
# Preprocess it ...
# Overwrite the raw "text" column with texthero's cleaned version.
df['text'] = hero.clean(df['text'])
# Example output (first two rows of df after cleaning):
#                                                 text      topic
# 0  claxton hunting first major medal british hurd...  athletics
# 1  sullivan could run worlds sonia sullivan indic...  athletics
# ... represent it
# Store a TF-IDF representation of each document in a new "tfidf" column;
# max_features=100 is passed to cap the number of features.
df['tfidf'] = hero.tfidf(df['text'], max_features=100)
# Preview the vectors next to their topic labels.
df[['tfidf', 'topic']].head(2)
# Example output:
#                                                tfidf      topic
# 0  [0.0, 0.13194458247285848, 0.0, 0.0, 0.0, 0.0,...  athletics
# 1  [0.0, 0.13056235989725676, 0.0, 0.205187581391...  athletics
# Reduce dimension and visualize the vector space
# Project the TF-IDF vectors with PCA and keep the result in a "pca" column.
df['pca'] = hero.pca(df['tfidf'])
# Scatter-plot the PCA column, coloring each point by its topic label.
hero.scatterplot(
    df,
    col='pca',
    color='topic',
    title="PCA BBC Sport news",
)
# ... need more? find named entities
# Fix: the call to hero.named_entities() was missing its closing
# parenthesis, which left the whole expression unterminated.
df['named_entities'] = hero.named_entities(df['text'])
# Preview the extracted entities next to their topic labels.
df[['named_entities', 'topic']].head(2)
# Example output:
#                                       named_entities      topic
# 0  [(claxton, ORG, 0, 7), (first, ORDINAL, 16, 21...  athletics
# 1  [(sullivan, ORG, 0, 8), (sonia sullivan, PERSO...  athletics
# Show top words ...
# Number of leading entries to keep from the top_words() result.
NUM_TOP_WORDS = 5
# Slice the first NUM_TOP_WORDS entries of the word counts for "text".
hero.top_words(df['text'])[:NUM_TOP_WORDS]
# Example output:
# text
# said       1338
# first       790
# england     749
# game        681
# one         671
# And much more!