# sort_sentences.py

import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt

done_msg = '> done.'

# sys.argv[0] is the script name; keep only the user-supplied arguments
argument_list = sys.argv[1:]

# input / output locations (in_path, out_path and freq_path are required)
in_path     = ""
column_name = "sentence"
out_path    = ""
freq_path   = ""

# optional dictionary used to annotate new words with translations
dict_path   = ""
translate   = False
meaning     = None

sentence_count          = 20   # -1 means "as many as there are sentences"
consider_sentence_limit = 0    # 0 means "consider every sentence"
min_sentence_length     = 5

stemmed  = False
stemming = ''

# every recognised flag consumes exactly one value: the argument after it
known_flags = {
    "-is", "--input_sentences",
    "-if", "--input_frequencies",
    "-o", "--out",
    "-col", "--sentence_column",
    "-sc", "--sentence_count",
    "-csl", "--consider_n_sentences",
    "-msl", "--min_sentence_length",
    "-s", "--stemming",
    "-d", "--dictionary",
}

# set when a flag is missing its value or a numeric flag is not an integer
usage_error = False

for i, argument in enumerate(argument_list):

    if argument not in known_flags:
        continue

    # a flag given as the very last argument has no value to read
    if i + 1 >= len(argument_list):
        print(f"error: missing value for '{argument}'")
        usage_error = True
        break

    value = argument_list[i + 1]

    try:
        if argument in ("-is", "--input_sentences"):
            in_path = value

        elif argument in ("-if", "--input_frequencies"):
            freq_path = value

        elif argument in ("-o", "--out"):
            out_path = value

        elif argument in ("-col", "--sentence_column"):
            column_name = value

        # numeric options go through int() rather than eval(), so text typed
        # on the command line is never executed as code
        elif argument in ("-sc", "--sentence_count"):
            sentence_count = int(value)

        elif argument in ("-csl", "--consider_n_sentences"):
            consider_sentence_limit = int(value)

        elif argument in ("-msl", "--min_sentence_length"):
            min_sentence_length = int(value)

        elif argument in ("-s", "--stemming"):
            stemmed = True
            stemming = value

        elif argument in ("-d", "--dictionary"):
            translate = True
            dict_path = value

    except ValueError:
        print(f"error: '{argument}' expects an integer, got '{value}'")
        usage_error = True
        break

# show the usage text when arguments are malformed or a required path is missing
if usage_error or not (in_path and out_path and freq_path):

    print("""
    -is: \t*(--input_sentences) input csv file containing sentences
    -if: \t*(--input_frequencies) input csv file containing frequencies
    -o:  \t*(--out) output csv file for sorted sentences
    -col:\t (--sentence_column) name of the column containing sentences in -is (def. 'sentence')
    -sc: \t (--sentence_count) number of sentences to return, default (-1 for max, 20 default)
    -csl:\t (--consider_n_sentences) number of considered sentences in the corpus
    -msl:\t (--min_sentence_length) minimum sentence length for consideration
    -d:  \t (--dictionary) dictionary file, if not used, translations won't be written
         \t expected cols: 'word', 'translation'
    -s:  \t (--stemming) stemming language, default none, available: ru
    
    *: required
    """)
    sys.exit(1 if usage_error else 0)
    
    
# word -> translation lookup; it stays empty when no dictionary is supplied so
# that code reading `meanings` later (translation formatting, the final
# report) works whether or not -d was given
meanings = {}

if translate:
    print('loading dictionary...')
    # columns are taken by position: first column is the word, second its translation
    dict_pairs = np.array(pd.read_csv(dict_path))
    meanings = {pair[0]: pair[1] for pair in dict_pairs}

    print(done_msg)
        

if stemming != 'ru':
    # no (or an unrecognised) stemming language: leave sentences untouched
    def get_stems(sentence):
        """Return `sentence` unchanged."""
        return sentence

else:
    print('loading stemmer...')
    from lib.stemmers.ru_stemmer import stemmer

    def get_stems(sentence):
        """Clean `sentence` with the Russian stemmer, then stem it."""
        return stemmer.stemming(stemmer.cleaning(sentence))

    print(done_msg)

# load the sentence corpus (csv generated by preprocess.py)
df_s = pd.read_csv(in_path)

# stringify every cell and drop the ones that read "nan" (missing values)
sentences = [text for text in map(str, df_s[column_name]) if text != "nan"]

# a limit of 0 (the default) means "consider every sentence"
if not consider_sentence_limit:
    consider_sentence_limit = len(sentences)
sentences = sentences[:consider_sentence_limit]

# load the word frequencies (csv generated by preprocess.py)
df_f = pd.read_csv(freq_path)

# column 0 holds the words, column 1 their counts (forced to int)
frequencies = np.array(df_f)
frequencies[:, 1] = frequencies[:, 1].astype(int)

# the words themselves, plus a set of them for O(1) membership tests
words     = frequencies[:, 0]
words_set = set(words)

# running totals of the counts, starting at 0;
# the final entry is the total word count of the corpus
freqs_cumulative = [0] + list(np.cumsum(frequencies[:, 1].astype(int)))

wcount = freqs_cumulative[-1]

# turn each sentence into a list of per-word feature values
# features:
#   'orders':      row index of each word in `frequencies`
#   'frequencies': count of each word divided by the global `wcount`
def sentences_to_feature(sentences, feature, frequencies, verbose=False):
    """Map every sentence to the feature values of its known words.

    Sentences are split on single spaces; tokens that do not appear in the
    first column of `frequencies` are skipped.

    Args:
        sentences: iterable of sentence strings.
        feature: 'orders' or 'frequencies'; any other value yields an
            empty result.
        frequencies: 2-D array with words in column 0 and counts in column 1.
        verbose: print progress messages when true.

    Returns:
        A list with one list of feature values per sentence.
    """
    if verbose:
        print(f"extracting feature '{feature}' from sentences...")

    # word -> row index; a word listed more than once keeps its last row
    row_of = {word: row for row, word in enumerate(frequencies[:, 0])}

    if feature == 'orders':
        extracted = [
            [row_of[token] for token in sentence.split(' ') if token in row_of]
            for sentence in sentences
        ]

    elif feature == 'frequencies':
        # counts normalised by the module-level total word count
        relative = np.array(frequencies[:, 1]).astype(float) / wcount
        extracted = [
            [relative[row_of[token]] for token in sentence.split(' ') if token in row_of]
            for sentence in sentences
        ]

    else:
        extracted = []

    if verbose:
        print(done_msg)

    return extracted


# pick the sentence with the highest average vocabulary coverage
def get_ideal_index(sentences, frequencies):
    """Return the index of the best-scoring sentence, or -1 if there is none.

    Each sentence is reduced to its distinct tokens and scored as the sum of
    those tokens' normalised frequencies divided by the length of the
    de-duplicated sentence string.

    Args:
        sentences: list of (already stemmed) sentence strings.
        frequencies: 2-D array with words in column 0 and counts in column 1.

    Returns:
        Index into `sentences` of the highest score (first one on ties), or
        -1 when `sentences` is empty.
    """
    # nothing to choose from: np.argmax would raise on an empty list, so
    # report -1, the failure value get_in_order already checks for
    if len(sentences) == 0:
        return -1

    # count every distinct token only once per sentence
    xsentences = [' '.join(set(sentence.split(' '))) for sentence in sentences]

    xsentence_frequencies = sentences_to_feature(xsentences, 'frequencies', frequencies)

    # NOTE(review): len(xs) is the character length of the joined string, not
    # its token count - confirm that per-character averaging is intended
    sums = [np.sum(xsf) / len(xs) if xsf else 0
            for xsf, xs in zip(xsentence_frequencies, xsentences)]

    return np.argmax(sums)
    
# order sentences iteratively
# at each step, all remaining sentences are evaluated and the best is chosen;
# sentence and word lists are updated accordingly
# (encountered words are removed, chosen sentence is removed)
# pretty inefficient ikr ¯\_(ツ)_/¯
def get_in_order(sentences, frequencies, sentence_count=20, metric="vanilla", verbose=False):
    """Pick up to `sentence_count` sentences and track the coverage they add.

    Reads the module-level `min_sentence_length`, `words_set`, `get_stems`
    and `done_msg`.

    Args:
        sentences: list of sentence strings.
        frequencies: 2-D array with words in column 0 and counts in column 1.
        sentence_count: number of sentences to pick; -1 means len(sentences).
        metric: "vanilla" takes sentences in their original order, "max-avg"
            asks get_ideal_index for the best remaining sentence.
        verbose: print per-step progress when true.

    Returns:
        (sentences_ordered, learning_history, vocab_ordered): the chosen
        sentences, the cumulative coverage in percent (starting with 0), and
        the list of new words contributed by each chosen sentence.
        None when `metric` is unknown or no sentence could be chosen.
    """
    print(f'calculating {metric} order coverage...')

    sentence_count = sentence_count if sentence_count != -1 else len(sentences)

    # de-duplicate while keeping first-occurrence order (a set would shuffle
    # the corpus and defeat the "vanilla" baseline), then drop short sentences
    remaining_sentences = [sentence for sentence in dict.fromkeys(sentences)
                           if len(sentence) > min_sentence_length]

    # stemmed twin of remaining_sentences, kept index-aligned with it
    remaining_sentences_stemmed = [get_stems(sentence).replace('\n', '')
                                   for sentence in remaining_sentences]

    remaining_frequencies = np.copy(frequencies)

    if verbose:
        print(len(remaining_sentences), len(remaining_frequencies))

    sentences_ordered = []
    vocab = set()
    vocab_ordered = []
    cumulative_return = 0
    learning_history = [0]

    # maximum number of sentence characters echoed in verbose output
    trunc = 100

    for i in range(sentence_count):

        if metric not in ("vanilla", "max-avg"):
            print("error: invalid metric")
            return None

        # corpus exhausted (or empty to begin with): nothing left to pick
        if not remaining_sentences:
            break

        if metric == "vanilla":
            # original order: always take the next sentence in line
            chosen_index = 0
        else:
            chosen_index = get_ideal_index(remaining_sentences_stemmed, remaining_frequencies)

        if chosen_index == -1:
            print("error")
            return None

        sentence = remaining_sentences.pop(chosen_index)
        sentence_stemmed = remaining_sentences_stemmed.pop(chosen_index)

        # words this sentence adds: present in the frequency list and not seen
        # before; dict.fromkeys de-duplicates while keeping sentence order
        newvocab = [word for word in dict.fromkeys(sentence_stemmed.split(' '))
                    if word not in vocab and word in words_set]
        filtered = ' '.join(newvocab)

        # both calls receive a single sentence, so each result has one entry
        orders = sentences_to_feature([filtered], 'orders', remaining_frequencies)
        sfreqs = sentences_to_feature([filtered], 'frequencies', remaining_frequencies)

        vocab.update(newvocab)
        vocab_ordered.append(list(newvocab))

        new_percentage = 100 * np.sum(sfreqs[0])
        cumulative_return += new_percentage

        if verbose:
            print('{} - return: {:.2f}% ({:.2f}% cumulative)'.format(i, new_percentage, cumulative_return),
                  f'\n{chosen_index}:\t"{sentence[:trunc]}{"..."*int(len(sentence)>trunc)}"', '\n')

        learning_history.append(cumulative_return)
        sentences_ordered.append(sentence)

        # drop the rows of the newly added words so they are not counted again
        learned_rows = orders[0]
        if learned_rows:
            remaining_frequencies = np.delete(remaining_frequencies, learned_rows, 0)

    print(done_msg)
    return sentences_ordered, learning_history, vocab_ordered
    
# vocabulary known after each sentence, as a list of growing sets
def get_cumulative_count(sentences):
    """Return the cumulative vocabulary after each sentence.

    The result starts with an empty set and gains one set per sentence, each
    holding every stemmed token seen so far that is in `words_set`.
    """
    vocabs = [set()]
    for sentence in sentences:
        known = {token for token in get_stems(sentence).split(' ') if token in words_set}
        # union builds a fresh set, so earlier snapshots stay untouched
        vocabs.append(vocabs[-1] | known)
    return vocabs

# format words together with their dictionary meanings as a single string
def words_to_dictentries(words):
    """Render `words` as "(word: meaning), (word: meaning), ...".

    Meanings come from the module-level `meanings` mapping; a word without
    an entry is shown with "-".
    """
    return ", ".join(f'({word}: {meanings.get(word, "-")})' for word in words)

# write the generated course (sentences, coverage, new words) to a csv file
def course_to_csv(ordered, history, vocab, sep='\t'):
    """Write one row per chosen sentence to `<out_path>.csv`.

    Args:
        ordered: chosen sentences, in order.
        history: cumulative coverage values with one extra leading entry
            (get_in_order seeds it with 0), which is skipped here.
        vocab: list of new-word lists, one per sentence.
        sep: column separator.

    The last column lists the sentence's new words: with their dictionary
    meanings when the module-level `translate` is set, otherwise joined by
    spaces. The file is written as UTF-8.
    """
    rows = [f'sentence{sep}cumulative coverage{sep}new words\n']

    for sentence, coverage, words in zip(ordered, history[1:], vocab):
        if translate:
            new_words = words_to_dictentries(words)
        else:
            # join this row's own words, not the whole `vocab` list of lists
            new_words = " ".join(words)
        rows.append(f'{sentence}{sep}{coverage}{sep}{new_words}\n')

    # the context manager closes the file even if writing fails
    with open(out_path + ".csv", "w", encoding="utf-8") as f:
        f.writelines(rows)
    

# per-sentence normalised word frequencies for the whole corpus
sentence_frequencies = sentences_to_feature(sentences, 'frequencies', frequencies)


# baseline run with the "vanilla" metric
ordered_vanilla, history_vanilla, vocab_vanilla = get_in_order(
    sentences, frequencies, sentence_count, metric="vanilla",
)

# run with the "max-avg" metric, printing progress
ordered_maxavg, history_maxavg, vocab_maxavg = get_in_order(
    sentences, frequencies, sentence_count, metric="max-avg", verbose=True,
)

# vocabulary snapshots after each chosen sentence
vcbva = get_cumulative_count(ordered_vanilla)
vcbma = get_cumulative_count(ordered_maxavg)

# write the max-avg course to disk
course_to_csv(ordered_maxavg, history_maxavg, vocab_maxavg)

# done
print('execution complete.')

# summary report
covered_words = len(vcbma[-1])
print(f'{len(words)}\tunique words in corpus.')
print(f'{len(meanings)}\tunique words in dictionary.')
print(f'{covered_words}\tunique words covered, making up {history_maxavg[-1]:.0f}% of the corpus.')

# constructed vcc curve values might exceed ideal curve when stemming is involved
# todo:
if stemmed:
    print('\nDISCLAIMER: the graph might be a bit off with `stemming`.')


# coverage curves for comparison
# reference curve: cumulative share of the first `covered_words` rows of the frequency list
ideal_curve = 100 * np.array(freqs_cumulative[:covered_words + 1]) / wcount
plt.plot(ideal_curve, label='VC ideal')
plt.plot([len(snapshot) for snapshot in vcbma], history_maxavg, label='VC constructed')
plt.plot([len(snapshot) for snapshot in vcbva], history_vanilla, label='VC for original order')

plt.title(f'Vocabulary coverage by number of studied words\n(first {sentence_count} sentences)')
plt.ylabel('vocabulary coverage (%)')
plt.xlabel('unique words studied')

plt.legend()
plt.show()