
Search engine for domain-specific French users

This tutorial is part of the work I did during my internship at Safran. The objective is to build a search engine that takes a user query as input and returns a shortlist of stored documents that are highly relevant to that query. Both the user query and the documents must be projected into the same comparison space. We project them into a vector space after common NLP steps: tokenization, spelling correction, lemmatization and stop-word removal. Finally, since English is the default language of most NLP tutorials, we rely on tools that work well with French text.

In this post, we will build a semantic document search engine.

Prerequisites

  • Python >=3.7
  • NLTK
  • Pandas
  • Scikit-learn

Imports

import re, json
import unicodedata, string
import time
import operator
import numpy as np 
import pandas as pd
from collections import Counter
from collections import defaultdict
import nltk 
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

data

Files used in the notebook are stored in the data folder of the repository.
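Later sections assume an existing DataFrame called df_news with the two columns shown in the output further down (Subject and Clean_Keyword). If you are reproducing the notebook, a minimal sketch of that starting point, assuming an initially empty table, could be:

import pandas as pd

# Assumed starting point (not shown in the original notebook): an empty
# DataFrame with the two columns that the dataset-building code expects.
df_news = pd.DataFrame(columns=['Subject', 'Clean_Keyword'])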

**1: Create keywords from a sentence, based on the words of a dictionary and a text corpus, going through tokenization, correction, lemmatization and stop-word removal**

preprocessing

def get_dico():
    """Load the French word list used as the reference dictionary."""
    textdir = "liste.de.mots.francais.frgut_.txt"
    try:
        DICO = open(textdir,'r',encoding="utf-8").read()
    except:
        DICO = open(textdir,'r').read()
    return DICO


def remove_accents(input_str):
    """This method removes all diacritic marks from the given string"""
    norm_txt = unicodedata.normalize('NFD', input_str)
    shaved = ''.join(c for c in norm_txt if not unicodedata.combining(c))
    return unicodedata.normalize('NFC', shaved)
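A quick sanity check of the accent removal (the expected result is shown as a comment):

print(remove_accents("dépassement créé"))   # -> depassement cree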

def clean_sentence(texte):
    # Remove diacritics
    texte = remove_accents(texte)
    # Lowercase the document
    texte = texte.lower()
    # Remove mentions
    texte = re.sub(r'@\w+', '', texte)
    # Remove punctuation
    texte = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', texte)
    # Collapse repeated whitespace
    texte = re.sub(r'\s{2,}', ' ', texte)
    # Strip leading and trailing whitespace
    texte = texte.strip()

    return texte
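And a quick check of the full cleaning step, with the expected result as a comment (assuming Python's default string.punctuation set):

print(clean_sentence("Le Système,   est DOWN !"))   # -> le systeme est down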


def tokenize_sentence(texte):
    # Clean the sentence
    texte = clean_sentence(texte)
    # Tokenize on whitespace
    liste_words = texte.split()
    return liste_words

def strip_apostrophe(liste_words):
    get_radical = lambda word: word.split('\'')[-1]
    return list(map(get_radical,liste_words))

def pre_process(sentence):
    # Remove '_' from the sentence
    sentence = sentence.replace('_','')

    # Get the words of the sentence
    liste_words = tokenize_sentence(sentence)
    # Drop words of one or two letters
    liste_words = [elt for elt in liste_words if len(elt)>2]
    # Keep only the part after the apostrophe (e.g. "l'erreur" -> "erreur")
    liste_words = strip_apostrophe(liste_words)
    print('\nsentence to words : ',liste_words)
    return liste_words

Word correction

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
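As an illustration, edits1 produces a few hundred candidate strings for a seven-letter word, and the hypothetical misspelling 'comande' is one insertion away from 'commande':

print('commande' in edits1('comande'))   # True: inserting an extra 'm' is a single edit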




def DICO_ET_CORRECTEUR():
    "Return the dictionary word counts and a spelling-correction function."
    DICO = get_dico()
    WORDS = Counter(pre_process(DICO))  # Counter maps each word of the dictionary to its count
    # Word correction
    N = sum(WORDS.values())
    P = lambda word: WORDS[word] / N  # Probability of `word`

    correction = lambda word: max(candidates(word), key=P)  # Most probable candidate
    return WORDS, correction

WORDS,CORRECTION = DICO_ET_CORRECTEUR()
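A quick check of the corrector (illustrative only; for a misspelled word the result depends on which candidates appear in the loaded word list):

print(CORRECTION('fournisseur'))   # a word already in the dictionary is returned unchanged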

Stopwords and stemming (first example)

## Stopwords, list taken from https://www.ranks.nl/stopwords/french
with open('stp_words_.txt','r') as f:
    STOPWORDS = set(f.read().split())

## Stemmer lookup table (word -> lemma)
with open("sample_.json",'r',encoding='cp1252') as json_file:
    LISTE = json.load(json_file)
my_stemmer = lambda word: LISTE[word] if word in LISTE else word
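sample_.json is a plain word-to-lemma lookup. Judging from the trace shown below, its entries look roughly like the illustrative excerpt here (not the actual file contents):

# Illustrative entries only -- the real mapping is loaded into LISTE above.
# {"reste": "rester", "approuve": "approuver", "commande": "commander", ...}
print(my_stemmer('approuve'))   # -> approuver, if the word is in LISTE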

Function: SENTENCE_TO_CORRECT_WORDS

def SENTENCE_TO_CORRECT_WORDS(sentence):
    "cette fonction retourne la liste des mots du user"
    print('\n------------pre_process--------\n')
    liste_words = pre_process(sentence)
    print(liste_words)
    print('\n------------correction--------\n')
    liste_words = list(map(CORRECTION,liste_words))
    print(liste_words)
    print('\n------------stemming--------\n')
    liste_words = list(map(my_stemmer,liste_words))
    print(liste_words)
    print('\n------------remove stop-words--------\n')
    liste_words = [elt for elt in liste_words if elt not in STOPWORDS]
    print(liste_words)
    print('\n-------------------------------------\n')
    return liste_words
SENTENCE_TO_CORRECT_WORDS usage
SENTENCE_TO_CORRECT_WORDS('La PR reste au statut «\xa0Approuve(e)\xa0» et il n’y a pas de commande\"\'')
------------pre_process--------
['reste', 'statut', 'approuve', 'n’y', 'pas', 'commande']

------------correction--------
['reste', 'statut', 'approuve', 'non', 'pas', 'commande']

------------stemming--------
['rester', 'statut', 'approuver', 'non', 'pas', 'commander']

------------remove stop-words--------
['rester', 'statut', 'approuver', 'commander']

-------------------------------------
['rester', 'statut', 'approuver', 'commander']

Create dataset

def open_file(textdir):
    """Open a text file, trying utf-8, then the default encoding, then cp1252."""
    found = False
    try:
        texte = open(textdir,'r',encoding="utf-8").read(); found = True
    except: pass
    if not found:
        try:
            texte = open(textdir,'r').read(); found = True
        except: pass
    if not found:
        texte = open(textdir,'r',encoding='cp1252').read(); found = True
    return texte

def add_col(df_news, titre, keywords):
    """Append a (title, keywords) row to the DataFrame."""
    # Note: DataFrame.append was removed in pandas 2.0; use pd.concat on newer versions.
    return df_news.append(dict(zip(df_news.columns, [titre, keywords])), ignore_index=True)

liste_pb = [elt for elt in open_file('liste_pb_.txt').split('\n') if elt]
df_new = df_news.drop(df_news.index)   # empty copy of df_news, keeping its columns
for i, titre in enumerate(liste_pb):
    keywords = ','.join(SENTENCE_TO_CORRECT_WORDS(titre))
    df_new = add_col(df_new, titre, keywords)
df_new.head()
Output
                    Subject                                    Clean_Keyword
0  Message d'erreur : "Le fournisseur ARIBA n'exi...  message,erreur,fournisseur,aria,exister
1  Message d'erreur : "Commande d’article non aut...  message,erreur,commander,article,autoriser,oto
2  Message d'erreur : "Statut utilisateur FERM ac...  message,erreur,statut,utilisateur,actif,oto
3  Message d'erreur : "Statut systeme TCLO actif ...  message,erreur,statut,systeme,col,actif,nord
4  Message d'erreur "___ Cost center change could...  message,erreur,coat,centrer,changer,cold,affecter
5  Messaeg d'erreur "___ OTP change could not be ...  message,erreur,otp,changer,cold,affecter
6  Messaeg d'erreur "Entrez Centre de couts"          message,erreur,entrer,centrer,cout
7  Message d'erreur "Indiquez une seule imputatio...  message,erreur,indiquer,imputation,statistique
8  Message d'erreur "Imputations CO ont des centr...  message,erreur,imputation,centrer,profit
9  Message d'erreur "Poste ___ Ordre ___ depassem...  message,erreur,poster,ordre,depassement,budget
10  Message d'erreur "Entrez une quantite de comma...  message,erreur,entrer,quantite,commander
11  Message d'erreur "Indiquez la quantite"          message,erreur,indiquer,quantite
12  Message d'erreur "Le prix net doit etre superi...  message,erreur,prix,net,superieur
...  ...  ...
57  UO4-5 Commande | Envoi d'une commande manuelle  uo4,commander,envoi,commander,manuel
58  UO5-4 Reception | Anomalie workflow  uo5,reception,anomalie,workflow
59  UO5-1 Reception | Modification(s) de reception(s)  uo5,reception,modification,reception
60  UO5-2 Reception | Annulation(s) de reception(s)  uo5,reception,annulation,reception
61  UO5-3 Reception | Forcer la reception  uo5,reception,forcer,reception
62  UO3-5 Demande d'achat | Demande de support cre...  uo3,demander,achat,demander,support,creation
63  UO3-6 Demande d'achat | Demande de support mod...  uo3,demander,achat,demander,support,modification
64  UO3-7 Demande d'achat | Demande de support ann...  uo3,demander,achat,demander,support,annulation
65  UO4-2 Commande | Demande de support modificati...  uo4,commander,demander,support,modification,co...

Tokenization and lemmatization (second example)

# WordNetLemmatizer requires POS tags to know whether a word is a noun, verb, adjective, etc. By default it assumes Noun.
def wordLemmatizer(data, colname):
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    file_clean_k = pd.DataFrame()
    for index, entry in enumerate(data):
        # Words that pass the filters for this entry
        Final_words = []
        # Initialize WordNetLemmatizer()
        word_Lemmatized = WordNetLemmatizer()
        # pos_tag provides the tag, i.e. whether the word is a Noun (N), Verb (V) or something else
        for word, tag in pos_tag(entry):
            # Keep only alphabetic tokens of more than one letter that are not stop words
            if len(word) > 1 and word not in stopwords.words('french') and word.isalpha():
                word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
                Final_words.append(word_Final)
        # Store the processed words for this entry and strip the list formatting
        file_clean_k.loc[index, colname] = str(Final_words)
        file_clean_k = file_clean_k.replace(to_replace=r"\[.", value='', regex=True)
        file_clean_k = file_clean_k.replace(to_replace="'", value='', regex=True)
        file_clean_k = file_clean_k.replace(to_replace=" ", value='', regex=True)
        file_clean_k = file_clean_k.replace(to_replace=r'\]', value='', regex=True)

    return file_clean_k


def wordLemmatizer_(sentence):
    # Take a sentence and return a comma-separated string of lemmatized words
    preprocessed_query = re.sub(r"\W+", " ", sentence).strip()
    tokens = word_tokenize(str(preprocessed_query))
    q_df = pd.DataFrame(columns=['q_clean'])
    idx = 0
    colname = 'keyword_final'
    q_df.loc[idx,'q_clean'] = tokens
    print('\n\n---inputtoken');print(q_df.q_clean)
    result = wordLemmatizer(q_df.q_clean, colname).loc[idx, colname]
    print('\n\n---outputlemma');print(result)
    return result
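Usage mirrors SENTENCE_TO_CORRECT_WORDS: the function prints the intermediate tokens and returns the lemmas as a single comma-separated string (the exact output depends on the NLTK models, so it is omitted here):

keywords = wordLemmatizer_("The computers were changed by the IT")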

2: Find the best matching sentence in a list of sentences

Method: TF-IDF

TF-IDF stands for Term Frequency - Inverse Document Frequency.

In order to compare the user input with the sentences already stored in the database, we go through two steps:

  • Normalize the database: apply the pre-processing method to every sentence in the database. We then have, for each sentence, a list of keywords.
  • For each keyword wd of each sentence stc, we compute:
  • \(frqc(wd,stc)\): number of occurrences of the keyword wd in the sentence stc
  • \(doc\_frqc(wd)\): number of sentences in which wd appears
  • \(N\): the total number of sentences

\(tf(wd,stc) = \frac {frqc(wd,stc)}{ \sum_{stc} frqc(wd,stc) }\)

\(idf(wd) = \log(\frac{N}{doc\_frqc(wd)})\)

\(tfidf(wd,stc) = tf(wd,stc) *idf(wd)\)

Example
  • st1: The computer is down
  • st2: We need to change the computers
  • st3: Changements have to be handle by the IT

keywords per sentence

st1: [computer , down]
st2: [need, change, computer]
st3: [change, handle, IT]

vocabulary: [computer , down, need, change, handle, IT]

tf          sentence1   sentence2   sentence3
computer    1/2         1/2         0
down        1           0           0
need        0           1           0
change      0           1/2         1/2
handle      0           0           1
IT          0           0           1

idf values for the keywords

N = number of sentences = 3

keyword     idf
computer    log(3/2)
down        log(3/1)
need        log(3/1)
change      log(3/2)
handle      log(3/1)
IT          log(3/1)

example for sentence 2: computing of the keywords tfidf values

\[ tfidf('computer') = tf('computer', sentence2)*idf('computer') = 1/2 * log(3/2)\\ tfidf('down') = 0 * log(3/1)\\ tfidf('need') = 1 * log(3/1)\\ tfidf('change') = 1/2 * log(3/2)\\ tfidf('handle') = 0 * log(3/1)\\ tfidf('IT') = 0 * log(3/1)\\ \]

vectorisation of sentence 2

sentence2 <==> [ 0.5*log(3/2), 0, 1*log(3/1), 0.5*log(3/2), 0, 0 ]

vectorisation of the sentences

\[ sentence1 <==> [\ 0.5*log(3/2),\ log(3/1),\ 0,\ 0,\ 0,\ 0]\\ sentence2 <==> [\ 0.5*log(3/2),\ 0,\ 1*log(3/1),\ 0.5*log(3/2),\ 0,\ 0]\\ sentence3 <==> [\ 0,\ 0,\ 0,\ 0.5*log(3/2),\ log(3/1),\ log(3/1)] \]

similarities between the user input and the sentences

  • user input: The IT have replaced all of the computers
  • keywords: [ 'IT', 'all', 'computer']
  • keywords found in the vocabulary: [ 'IT', 'computer']
  • vectorization: [1, 0, 0, 0, 0, 1]

scores

\[ sentence1: tfidf(sentence1)*vector = [\ 0.5*log(3/2),\ log(3/1),\ 0,\ 0,\ 0,\ 0] * [1,0,0,0,0,1] = 0.5*log(3/2)\\ sentence2: 0.5*log(3/2)\\ sentence3: log(3/1) \]
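The short sketch below reproduces the hand computation above with plain NumPy, using the tf and idf definitions of this post (note that scikit-learn's TfidfVectorizer, used later, applies smoothing and normalization, so its numbers differ slightly):

import numpy as np

# Vocabulary order used in the example above
vocab = ['computer', 'down', 'need', 'change', 'handle', 'IT']
sentences = [['computer', 'down'],
             ['need', 'change', 'computer'],
             ['change', 'handle', 'IT']]
N = len(sentences)

# frqc(wd, stc): occurrences of wd in stc; doc_frqc(wd): number of sentences containing wd
frqc = np.array([[s.count(w) for w in vocab] for s in sentences], dtype=float)
doc_frqc = (frqc > 0).sum(axis=0)

tf = frqc / frqc.sum(axis=0)   # tf(wd, stc) = frqc(wd, stc) / sum over sentences
idf = np.log(N / doc_frqc)     # idf(wd) = log(N / doc_frqc(wd))
tfidf = tf * idf

query = np.array([1, 0, 0, 0, 0, 1], dtype=float)   # 'computer' and 'IT'
print(tfidf @ query)   # [0.5*log(3/2), 0.5*log(3/2), log(3)] ~ [0.203, 0.203, 1.099]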

Function: cosine_similarity_T

def init(df_news):
  ## Create the vocabulary from the cleaned keywords
  vocabulary = set()
  for doc in df_news.Clean_Keyword:
      vocabulary.update(doc.split(','))
  vocabulary = list(vocabulary)
  # Initialize the TfIdf model with that fixed vocabulary
  tfidf = TfidfVectorizer(vocabulary=vocabulary)
  # Fit the TfIdf model and transform the database keywords
  tfidf.fit(df_news.Clean_Keyword)
  tfidf_tran = tfidf.transform(df_news.Clean_Keyword)
  # Store everything as globals so cosine_similarity_T can reuse them
  globals()['vocabulary'],globals()['tfidf'],globals()['tfidf_tran'] = vocabulary,tfidf,tfidf_tran

Create a vector for Query/search keywords

def gen_vector_T(tokens,df_news,vocabulary,tfidf,tfidf_tran):
  Q = np.zeros((len(vocabulary)))    
  x= tfidf.transform(tokens)
  #print(tokens[0].split(','))
  #print(keywords)
  for token in tokens[0].split(','):

      try:
          ind = vocabulary.index(token)
          Q[ind]  = x[0, tfidf.vocabulary_[token]]
          print(token,':',ind)
      except:
          print(token,':','not found')
          pass
  return Q

Cosine Similarity function

def cosine_sim(a, b):
    if not np.linalg.norm(a) and not np.linalg.norm(b): return -3
    if not np.linalg.norm(a):return -1
    if not np.linalg.norm(b):return -2
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim   
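A quick sanity check (the sentinel values -1, -2 and -3 flag zero-norm vectors instead of dividing by zero):

print(cosine_sim(np.array([1., 0.]), np.array([1., 1.])))   # ~0.707
print(cosine_sim(np.zeros(2), np.array([1., 1.])))          # -1: the first vector is empty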

def cosine_similarity_T(k, query, df_news, vocabulary=None, tfidf=None, tfidf_tran=None, mine=True):
    try:
        vocabulary = globals()['vocabulary']
        tfidf = globals()['tfidf']
        tfidf_tran = globals()['tfidf_tran']
    except KeyError:
        # The model has not been initialized yet: build it, then reload the globals
        print('up exception')
        init(df_news)
        vocabulary = globals()['vocabulary']
        tfidf = globals()['tfidf']
        tfidf_tran = globals()['tfidf_tran']
    q_df = pd.DataFrame(columns=['q_clean'])
    # mine=True uses the French pipeline, otherwise the WordNet-based one
    if mine: q_df.loc[0,'q_clean'] = ','.join(SENTENCE_TO_CORRECT_WORDS(query))
    else: q_df.loc[0,'q_clean'] = wordLemmatizer_(query)

    print('\n\n---q_df');print(q_df)

    print('\n\n')
    d_cosines = []
    query_vector = gen_vector_T(q_df['q_clean'], df_news, vocabulary, tfidf, tfidf_tran)
    for d in tfidf_tran.A:
        d_cosines.append(cosine_sim(query_vector, d))

    # Indices of the k highest scores, in decreasing order
    out = np.array(d_cosines).argsort()[-k:][::-1]
    d_cosines.sort()
    a = pd.DataFrame()
    for i, index in enumerate(out):
        a.loc[i,'index'] = str(index)
        a.loc[i,'Subject'] = df_news['Subject'][index]
    for j, simScore in enumerate(d_cosines[-k:][::-1]):
        a.loc[j,'Score'] = simScore
    return a

Test: cosine_similarity_T

def test(data,sentence,init_=False,mine=True):
  if not init_:
    deb = time.time();print('\n\n## ## ## ## ## #')
    init(df_news)
    print('\n## ## ## ## ## #temps init: ', time.time()-deb)
  deb = time.time();print('\n\n## ## ## ## ## #')
  print(cosine_similarity_T(10, sentence,df_news))
  print('\n## ## ## ## ## #temps methode 1: ', time.time()-deb)
sentence = 'Message d\'erreur \"La qte livree est differente de la qte facturee ; fonction impossible"'
sentence = 'erreur de conversion'
sentence = 'message d\'erreur'
sentence = "groupe d'acheteurs non défini"
sentence = "UO4"
sentence = "le fournisseur MDM n'existe pas"
init(df_new) 

cosine_similarity_T(10,sentence,df_new )
output
------------pre_process--------
['fournisseur', 'mdm', 'existe', 'pas']

------------correction--------
['fournisseur', 'mdm', 'existe', 'pas']

------------stemming--------
['fournisseur', 'mdm', 'exister', 'pas']

------------remove stop-words--------
['fournisseur', 'mdm', 'exister']

-------------------------------------

    index                   Subject                           Score
0  19  Message d'erreur "Le fournisseur MDM___ n’exis...  0.781490
1  0  Message d'erreur : "Le fournisseur ARIBA n'exi...  0.600296
2  20  Message d'erreur "Le fournisseur MDM___ est bl...  0.587467
3  14  Message d'erreur "Le centre de profit __ n'exi...  0.236420
4  33  Message d'erreur "Il existe des factures pour ...  0.214371
5  53  Message d'erreur "Fournisseur non present dans...  0.142208
6  18  Message d'erreur "Validation ___ : le compte _...  0.000000
7  30  Message d'erreur "Renseigner correctement le d...  0.000000
8  29  Message d'erreur "Article ___ non gere dans la...  0.000000
9  28  Message d'erreur "Fonctions oblig. Suivantes n...  0.000000
...  ...  ...

Conclusion and Discussions

  • The system is fast and reliable.
  • In our tests, the returned shortlist is highly relevant to the user query, with the best answer appearing either first or second.
  • Some of the methods used here are specific to French and would need to be adapted for English or other languages.