Visualization of the frequency characteristics of words in texts

While working on the classification of natural-language texts, I wondered how to visually display the frequency characteristics of a text, both with and without text normalization.

The first thing that comes to mind is a bar chart, but it is impossible to fit more than 20-30 words on one - everything becomes too small and illegible.

By normalization, I mean bringing words into their normal (canonical) form.

I posted my own attempt at a solution as an answer, but it would be interesting to see other solutions and ideas.

Author: MaxU, 2018-03-05

1 answer

I decided to use WordCloud (c)Andreas Mueller, after tokenizing and normalizing the text using NLTK and pymorphy2.

Examples (the code is given below):

Anton Chekhov. Complete works and letters - ModernLib.Ru.txt: enter a description of the image here

Alexander Pushkin. Complete collection of poems - royallib.ru.txt: enter a description of the image here

Code:

import os
import requests
from operator import attrgetter
from pathlib import Path
#import pandas as pd
import nltk
from nltk import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords
import pymorphy2
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt
def read_stopwords(path='./stopwords-ru.txt', encoding='utf-8'):
    """Build a combined set of Russian and English stop words.

    English stop words are taken from the NLTK ``stopwords`` corpus; Russian
    ones are read from a plain-text file with one word per line.

    Args:
        path: Location of the Russian stop-word file.
        encoding: Text encoding used to read that file.

    Returns:
        A ``set`` holding every Russian and English stop word.
    """
    stopwords_en = stopwords.words('english')
    with open(path, encoding=encoding) as f:
        # Strip every line and skip blanks: a trailing newline or stray
        # whitespace would otherwise add '' or padded entries that can never
        # match a token.
        stopwords_ru = {word for word in map(str.strip, f) if word}
    return stopwords_ru | set(stopwords_en)


def normalize_tokens(tokens):
    """Map every token to its normal (dictionary) form using pymorphy2.

    For each token, the ``normal_form`` of the first parse returned by
    ``MorphAnalyzer.parse`` is used.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list of normal forms, one per input token, in input order.
    """
    # pymorphy2's documentation recommends creating one MorphAnalyzer and
    # reusing it, so the instance is kept on the function between calls.
    morph = getattr(normalize_tokens, '_morph', None)
    if morph is None:
        morph = normalize_tokens._morph = pymorphy2.MorphAnalyzer()

    # Natural text repeats the same word forms many times; parse each
    # distinct token only once and reuse the result.
    lemmas = {}
    normalized = []
    for tok in tokens:
        if tok not in lemmas:
            lemmas[tok] = morph.parse(tok)[0].normal_form
        normalized.append(lemmas[tok])
    return normalized


def remove_stopwords(tokens, stopwords=None, min_length=4):
    """Drop stop words and too-short tokens.

    The length filter is applied even when no stop words are supplied, so
    ``min_length`` is never silently ignored.

    Args:
        tokens: Iterable of word strings.
        stopwords: Optional iterable of words to exclude. ``None`` or an
            empty collection means that no word is excluded by value.
        min_length: Minimum number of characters a token must have to be kept.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    blocked = set(stopwords) if stopwords else set()
    return [tok
            for tok in tokens
            if len(tok) >= min_length and tok not in blocked]


def plot_word_cloud(text, picture_fn='out.png', stopwords=None,
                    normalize=True, regexp=r'(?u)\b\w{4,}\b',
                    close_figure=True, **wc_kwargs):
    """Render a word cloud for ``text`` and save it as an image file.

    The text is split into sentences, each sentence is tokenized with
    ``regexp``, and the tokens are optionally normalized and filtered before
    being passed to ``WordCloud``.

    Args:
        text: Source text.
        picture_fn: Path of the image file to write.
        stopwords: Optional collection of words to exclude. When it is empty
            or ``None``, no filtering is performed.
        normalize: If true, tokens are passed through ``normalize_tokens``.
        regexp: Pattern that defines a token. The default matches runs of
            four or more word characters.
        close_figure: If true (default), the matplotlib figure is closed once
            the picture has been saved, so repeated calls do not accumulate
            open figures. Pass ``False`` to keep it open, e.g. for display.
        **wc_kwargs: Forwarded unchanged to the ``WordCloud`` constructor.
    """
    words = [w for sent in sent_tokenize(text)
             for w in regexp_tokenize(sent, regexp)]
    if normalize:
        words = normalize_tokens(words)
    if stopwords:
        words = remove_stopwords(words, stopwords)
    wc = WordCloud(**wc_kwargs).generate(' '.join(words))

    fig = plt.figure(figsize=(12, 10))
    try:
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.savefig(picture_fn)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        if close_figure:
            plt.close(fig)

def get_text(url, encoding='utf-8', to_lower=True, timeout=30):
    """Return the text found at an HTTP(S) URL or in a local file.

    Args:
        url: An ``http://`` / ``https://`` address or a path to an existing
            file. Any object is accepted; it is converted with ``str()``.
        encoding: Encoding used to read a local file. Not used for URLs.
        to_lower: If true, the text is lower-cased before it is returned.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Not used for local files.

    Returns:
        The text as a ``str``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        ValueError: ``url`` is neither an HTTP(S) URL nor an existing file.
    """
    url = str(url)
    # Require a real scheme: a bare 'http' prefix would also capture local
    # files such as 'http_notes.txt'.
    if url.startswith(('http://', 'https://')):
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # For text/* responses without an explicit charset, requests falls
        # back to ISO-8859-1, which garbles non-Latin text. Use the encoding
        # detected from the body in that case.
        declared = 'charset' in r.headers.get('Content-Type', '').lower()
        if not declared and (r.encoding or '').lower() == 'iso-8859-1':
            r.encoding = r.apparent_encoding
        content = r.text
    elif os.path.exists(url):
        with open(url, encoding=encoding) as f:
            content = f.read()
    else:
        raise ValueError('parameter [url] can be either URL or a filename')
    return content.lower() if to_lower else content


# Combined stop-word set shared by every word cloud below.
stopwords_ru = read_stopwords('./stopwords-ru.txt')

# One entry per picture: (text source, extra get_text() arguments, output file).
corpora = (
    # "Monday Begins on Saturday", downloaded over HTTP
    ('https://www.e-reading.club/txt.php/55060/%D0%A1%D1%82%D1%80%D1%83%D0%B3%D0%B0%D1%86%D0%BA%D0%B8%D0%B9_-_%D0%9F%D0%BE%D0%BD%D0%B5%D0%B4%D0%B5%D0%BB%D1%8C%D0%BD%D0%B8%D0%BA_%D0%BD%D0%B0%D1%87%D0%B8%D0%BD%D0%B0%D0%B5%D1%82%D1%81%D1%8F_%D0%B2_%D1%81%D1%83%D0%B1%D0%B1%D0%BE%D1%82%D1%83.txt',
     {},
     'ponedelnik_norm.png'),
    # Alexander Pushkin. Complete collection of poems - royallib.ru.txt
    ('./Пушкин Александр. Полное собрание стихотворений - royallib.ru.txt',
     {'encoding': 'cp1251'},
     'pushkin_norm.png'),
    # Anton Chekhov. Complete works and letters - ModernLib.Ru.txt
    ('./Чехов Антон. Полное собрание сочинений и писем - ModernLib.Ru.txt',
     {'encoding': 'cp1251'},
     'chekhov_norm.png'),
)

# Fetch each text and save its normalized word cloud, in the order listed.
for url, read_kwargs, picture_fn in corpora:
    text = get_text(url, **read_kwargs)
    plot_word_cloud(text, picture_fn, stopwords=stopwords_ru, max_words=100,
                    background_color='black', normalize=True)
 5
Author: MaxU, 2018-03-07 15:47:04