# -*- coding: utf-8 -*-
"""
Created on Thu Oct  3 13:09:28 2019

@author: claba
"""

# First of all, we import all the necessary libs
import nltk
import re
import unicodedata
import inflect
import math
from nltk.corpus import stopwords
from scipy.spatial import distance
import numpy as np


# ---------- SECTION 1 : DOCUMENTS PREPROCESSING ----------
    
# F1 : This function removes stop words from list of tokenized words

def remove_stopwords(wrd, language='italian'):
    """Drop stop words from a list of tokenized words.

    Args:
        wrd: iterable of word tokens (strings).
        language: name of the NLTK stop-word list to use. Defaults to
            'italian', the list this function has always used.

    Returns:
        A new list with the tokens of ``wrd`` that are not stop words,
        in their original order.
    """
    # Fetch the stop-word list once instead of once per token, and keep it
    # in a set so each membership test is a hash lookup, not a list scan.
    stop_set = set(stopwords.words(language))
    return [word for word in wrd if word not in stop_set]




# F2 : This function removes punctuation from list of tokenized words

def remove_punctuation(wrd):
    """Strip punctuation from every token and drop tokens that become empty.

    Any character that is neither a word character nor whitespace is removed
    from each token. Tokens that end up as the empty string are discarded.

    Args:
        wrd: iterable of word tokens (strings).

    Returns:
        A new list with the cleaned, non-empty tokens in their original order.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in wrd)
    return [token for token in stripped if token != '']


# F3 : This function removes non-ASCII chars from a list of tokenized words

def remove_non_ascii(wrd):
    """Reduce every token to its ASCII characters.

    Each token is NFKD-normalized, so accented letters decompose into a base
    letter plus combining marks, and then every non-ASCII code point is
    dropped. Tokens that become empty are kept as empty strings.

    Args:
        wrd: iterable of word tokens (strings).

    Returns:
        A new list with one ASCII-only string per input token.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in wrd]



# F4 : This function converts all characters to lowercase from a list of tokenized words

def to_lowercase(wrd):
    """Return a new list with every token of ``wrd`` converted to lowercase.

    Args:
        wrd: iterable of word tokens (strings).

    Returns:
        A list with the lowercased tokens, in their original order.
    """
    return [token.lower() for token in wrd]




# F5 : This function replaces all integer occurrences in a list of tokenized words with their textual representation

def replace_numbers(wrd):
    """Replace all-digit tokens with their textual representation.

    Tokens for which ``str.isdigit()`` is true are converted with
    ``inflect``'s ``number_to_words``. Every other token is passed through
    unchanged.

    Args:
        wrd: iterable of word tokens (strings).

    Returns:
        A new list with one entry per input token, in the original order.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in wrd
    ]



# The following function takes a record of a dataFrame containing our docs and preprocesses its title and description
# with all the previous functions

def preProcessing(x):
    """Tokenize a text and run it through the cleaning pipeline.

    Apostrophes are first replaced by spaces, then the text is tokenized
    with NLTK. The tokens are lowercased, stripped of stop words, stripped
    of punctuation, and finally reduced to ASCII, in that order.

    Args:
        x: the raw text to preprocess.

    Returns:
        The list of preprocessed tokens.
    """
    tokens = nltk.word_tokenize(re.sub(r'\'', ' ', x))

    # Order matters: stop words are matched on lowercased tokens, before
    # punctuation and accents are stripped. Number replacement and stemming
    # are not part of the pipeline.
    pipeline = (to_lowercase, remove_stopwords, remove_punctuation, remove_non_ascii)
    for step in pipeline:
        tokens = step(tokens)

    return tokens


# ----------SECTION 2 : CREATION OF VOCABULARY, INVERTED INDICES AND SCORE FUNCTIONS----------

# This function takes the (preprocessed) documents and returns the vocabulary of the corpus.
# Moreover, this function creates the first inverted index we need, in the form "word (key) : [list of docs that contain the word] (value)".
# It takes the (preprocessed) docs and returns both the vocabulary and the inverted index as dictionaries.

def create_dictionary_and_inverted_index(proc_df):
    """Build the vocabulary and the first inverted index of the corpus.

    Args:
        proc_df: mapping-like object whose "DENOMINAZIONE" entry holds the
            preprocessed documents (one list of tokens per document),
            addressable by the integers 0 .. n-1.

    Returns:
        A two-element list ``[vocabulary, inverted_index]`` where
        ``vocabulary`` maps each word to its id (assigned in order of first
        appearance, starting at 0) and ``inverted_index`` maps each word to
        the list of "row_<idx>" names of the documents containing it. A
        document is listed once per occurrence of the word, so repeated
        words produce repeated entries.
    """
    vocabulary = {}      # word -> word_id
    inverted_index = {}  # word -> ["row_<idx>", ...]
    all_den = proc_df["DENOMINAZIONE"]

    # Documents are looked up by idx, not iterated, so the lookup semantics
    # stay the same as elsewhere in this file.
    for idx in range(len(all_den)):
        row_name = "row_" + str(idx)
        for word in all_den[idx]:
            # Test membership on the dict itself: building a list of keys for
            # every token made this loop quadratic in the vocabulary size.
            if word not in vocabulary:
                vocabulary[word] = len(vocabulary)  # next free word_id
                inverted_index[word] = [row_name]
            else:
                # Append in place instead of rebuilding the posting list.
                inverted_index[word].append(row_name)

    return [vocabulary, inverted_index]

# This function takes a term, an inverted index and the total number of docs in the corpus to compute the IDF of the term
        
def IDFi(term, reverted_index, number_of_docs):
    """Compute the inverse document frequency of ``term``.

    Args:
        term: the word whose IDF is wanted. It must be a key of
            ``reverted_index``.
        reverted_index: mapping from each word to the list of documents that
            contain it.
        number_of_docs: total number of documents in the corpus.

    Returns:
        The base-10 logarithm of ``number_of_docs`` divided by the length of
        the term's posting list.
    """
    docs_with_term = len(reverted_index[term])
    ratio = number_of_docs / docs_with_term
    return math.log10(ratio)




# This function creates the second inverted index we need, in the form "word (key) : [(doc that contains the word, TF-IDF of the term in the doc), ...]"
# It takes the (preprocessed) docs, the number of docs, the vocabulary and a list containing all the IDFs, and returns the inverted index as a dictionary.

def create_inverted_index_with_TFIDF(proc_df, number_of_docs, vocabulary, idfi):
    """Build the second inverted index, which stores TF-IDF weights.

    Args:
        proc_df: mapping-like object whose "DENOMINAZIONE" entry holds the
            preprocessed documents (one list of tokens per document),
            addressable by the integers 0 .. number_of_docs-1.
        number_of_docs: how many documents to index.
        vocabulary: mapping from each word to its word id.
        idfi: sequence of IDF values indexed by word id.

    Returns:
        A dict mapping each word to a list of ``("row_<idx>", tfidf)``
        tuples. The TF-IDF is (occurrences of the word in the document /
        document length) * IDF of the word. One tuple is added per occurrence
        of the word, so a repeated word yields repeated identical tuples.
    """
    inverted_index2 = {}
    all_den = proc_df["DENOMINAZIONE"]

    for idx in range(number_of_docs):
        den = all_den[idx]
        row_name = "row_" + str(idx)
        doc_len = len(den)
        for word in den:
            tfidf = (den.count(word) / doc_len) * idfi[vocabulary[word]]
            # setdefault + append avoids both the per-token list of keys and
            # the rebuilding of the posting list on every insertion.
            inverted_index2.setdefault(word, []).append((row_name, tfidf))

    return inverted_index2

# This function takes the (processed) query, the document the query has to be compared to, the two inverted indices, the vocabulary and the list of IDFs,
# and returns the cosine similarity between the query and the document

def score(pquery, document_tuple, inverted_index, inverted_index_with_TFIDF, vocabulary, idfi):
    """Return the TF-IDF cosine similarity between a query and a document.

    Both vectors have one component per distinct word in the union of the
    query and the document, so each vector's norm covers all of its own
    terms.

    Args:
        pquery: the preprocessed query, as a list of tokens.
        document_tuple: ``(document_name, document_tokens)``, where
            ``document_name`` is the "row_<idx>" name used in the indices.
        inverted_index: word -> list of document names containing the word.
        inverted_index_with_TFIDF: word -> list of ``(document_name, tfidf)``
            tuples, in the same order as ``inverted_index[word]``.
        vocabulary: word -> word id.
        idfi: sequence of IDF values indexed by word id.

    Returns:
        ``1 - cosine distance`` of the two TF-IDF vectors, or ``0`` when the
        query is empty or when either vector is entirely zero.
    """
    document_name = document_tuple[0]
    document = document_tuple[1]

    # An empty query has no term frequencies (they would divide by zero)
    # and cannot be similar to anything.
    if not pquery:
        return 0

    query_len = len(pquery)
    # Build the term set once so both vectors share the same component order.
    terms = list(set(pquery).union(document))

    v1 = []  # query vector
    v2 = []  # document vector
    for word in terms:
        if word not in vocabulary:
            # Words unknown to the corpus contribute nothing on either side.
            v1.append(0)
            v2.append(0)
            continue

        v1.append((pquery.count(word) / query_len) * idfi[vocabulary[word]])

        postings = inverted_index[word]
        if document_name in postings:
            # Both indices list documents in the same order, so the position
            # found in the first index addresses the second one too.
            position = postings.index(document_name)
            v2.append(inverted_index_with_TFIDF[word][position][1])
        else:
            v2.append(0)

    # The cosine is undefined for a zero vector (scipy would give NaN), so
    # treat "nothing in common / nothing weighted" as similarity 0.
    if all(v == 0 for v in v1) or all(v == 0 for v in v2):
        return 0
    return 1 - distance.cosine(v1, v2)


# ---------- SECTION 3 : OTHER FUNCTIONS ----------

# Just trivial data cleaning for "COD_PROVINCIA" based on observed data.
def prov_foo_cleaning(e):
    """Clean a raw "COD_PROVINCIA" value.

    Args:
        e: the raw value, expected to be a sliceable sequence such as a
            string.

    Returns:
        ``e[3:]`` (everything from the fourth character on) when ``e`` has at
        most 6 characters and that suffix is not '000'. Returns ``np.nan``
        when ``e`` is longer than 6 characters, when the suffix is '000', or
        when ``e`` cannot be processed (e.g. it has no length).
    """
    try:
        if len(e) > 6:
            return np.nan
        suffix = e[3:]
        return np.nan if suffix == '000' else suffix
    except Exception:
        # Best-effort cleaning: any malformed value becomes NaN. Catching
        # Exception, not everything, lets SystemExit and KeyboardInterrupt
        # propagate.
        return np.nan