This commit is contained in:
user86 2020-10-10 06:04:58 +05:30
commit 3076819535
3 changed files with 290 additions and 0 deletions

View File

@ -0,0 +1,60 @@
''' This module contains the class
for generating incorrect alternative
answers for a given answer
'''
import gensim
import gensim.downloader as api
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
import random
import numpy as np
class IncorrectAnswerGenerator:
    ''' Generates incorrect alternative answers (distractors)
    for a given correct answer, using word-embedding similarity.
    '''
    def __init__(self, document):
        # Pre-trained embeddings used to fetch words similar to the answer.
        self.model = api.load("glove-wiki-gigaword-100")
        # Unique vocabulary of the document: fallback pool of candidate
        # distractors when the answer is missing from the embedding vocab.
        self.all_words = []
        for sent in sent_tokenize(document):
            self.all_words.extend(word_tokenize(sent))
        self.all_words = list(set(self.all_words))
    def get_all_options_dict(self, answer, num_options):
        ''' Returns a dict of 'num_options' options numbered
        1..num_options, exactly one of which is the correct
        answer (placed at a random position).

        Params:
            * answer : string, the correct answer
            * num_options : int, total number of options to produce
        Returns:
            * dict<int, str>
        '''
        options_dict = dict()
        try:
            # Reversed so the LEAST similar of the top-15 neighbours come
            # first: distractors stay related but are not near-duplicates.
            similar_words = self.model.similar_by_word(answer, topn=15)[::-1]
            for i in range(1, num_options + 1):
                options_dict[i] = similar_words[i - 1][0]
        except (KeyError, IndexError):
            # KeyError: answer is out of the embedding vocabulary.
            # IndexError: fewer neighbours returned than options needed.
            # Fall back to ranking the document's own words by similarity.
            self.all_sim = []
            for word in self.all_words:
                if word not in answer:
                    try:
                        self.all_sim.append(
                            (self.model.similarity(answer, word), word))
                    except KeyError:
                        # Word itself is out of vocabulary: neutral score.
                        self.all_sim.append(
                            (0.0, word))
                else:
                    # Words contained in the answer get the lowest score
                    # so they are never picked as distractors.
                    self.all_sim.append((-1.0, word))
            self.all_sim.sort(reverse=True)
            for i in range(1, num_options+1):
                options_dict[i] = self.all_sim[i-1][1]
        # Overwrite one random slot with the correct answer.
        replacement_idx = random.randint(1, num_options)
        options_dict[replacement_idx] = answer
        return options_dict

199
question_extraction.py Normal file
View File

@ -0,0 +1,199 @@
'''This file contains the module for generating
questions (and their answers) from a document
'''
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
class QuestionExtractor:
    ''' Extracts fill-in-the-blank questions from a document.

    Candidate answers are the named entities found by the spacy NER
    tagger; they are ranked by the tf-idf scores of their words, and
    each selected entity is blanked out of a sentence containing it.
    '''
    def __init__(self, num_questions):
        # Maximum number of questions to generate.
        self.num_questions = num_questions
        # hash set for fast stopword lookup
        self.stop_words = set(stopwords.words('english'))
        # named entity recognition tagger
        self.ner_tagger = spacy.load('en_core_web_md')
        self.vectorizer = TfidfVectorizer()
        self.questions_dict = dict()
    def get_questions_dict(self, document):
        '''
        Returns a dict of questions in the format:
            question_number: {
                question: str
                answer: str
            }
        Params:
            * document : string
        Returns:
            * dict
        '''
        # find candidate keywords (named entities)
        self.candidate_keywords = self.get_candidate_entities(document)
        # set word scores before ranking candidate keywords
        self.set_tfidf_scores(document)
        # rank the keywords using calculated tf idf scores
        self.rank_keywords()
        # form the questions
        self.form_questions()
        return self.questions_dict
    def get_filtered_sentences(self, document):
        ''' Returns a list of sentences - each of
        which has been cleaned of stopwords.
        Params:
            * document: a paragraph of sentences
        Returns:
            * list<str> : list of string
        '''
        sentences = sent_tokenize(document)  # split documents into sentences
        return [self.filter_sentence(sentence) for sentence in sentences]
    def filter_sentence(self, sentence):
        '''Returns the sentence without stopwords
        Params:
            * sentence: A string
        Returns:
            * string
        '''
        words = word_tokenize(sentence)
        return ' '.join(w for w in words if w not in self.stop_words)
    def get_candidate_entities(self, document):
        ''' Returns a list of unique entities according to
        spacy's ner tagger. These entities are candidate
        answers for the questions
        Params:
            * document : string
        Returns:
            * list<str>
        '''
        entities = self.ner_tagger(document)
        entity_list = []
        for ent in entities.ents:
            entity_list.append(ent.text)
        return list(set(entity_list))  # remove duplicates
    def set_tfidf_scores(self, document):
        ''' Sets self.word_score (average tf-idf score of each word
        over all sentences) and self.sentence_for_max_word_score
        (the sentence in which each word scores highest).
        '''
        self.unfiltered_sentences = sent_tokenize(document)
        self.filtered_sentences = self.get_filtered_sentences(document)
        self.word_score = dict()  # (word, score)
        # (word, sentence where word score is max)
        self.sentence_for_max_word_score = dict()
        tf_idf_vector = self.vectorizer.fit_transform(self.filtered_sentences)
        # sklearn renamed get_feature_names to get_feature_names_out in 1.0
        # and removed the old name in 1.2; support both versions.
        try:
            feature_names = self.vectorizer.get_feature_names_out()
        except AttributeError:
            feature_names = self.vectorizer.get_feature_names()
        tf_idf_matrix = tf_idf_vector.todense().tolist()
        num_sentences = len(self.unfiltered_sentences)
        num_features = len(feature_names)
        for i in range(num_features):
            word = feature_names[i]
            self.sentence_for_max_word_score[word] = ""
            tot = 0.0
            cur_max = 0.0
            for j in range(num_sentences):
                tot += tf_idf_matrix[j][i]
                if tf_idf_matrix[j][i] > cur_max:
                    cur_max = tf_idf_matrix[j][i]
                    self.sentence_for_max_word_score[word] = self.unfiltered_sentences[j]
            # average score for each word
            self.word_score[word] = tot / num_sentences
    def get_keyword_score(self, keyword):
        ''' Returns the score for a keyword: the sum of
        the tf-idf scores of its individual words.
        Params:
            * keyword : string of possible several words
        Returns:
            * float : score
        '''
        score = 0.0
        for word in word_tokenize(keyword):
            if word in self.word_score:
                score += self.word_score[word]
        return score
    def get_corresponding_sentence_for_keyword(self, keyword):
        ''' Finds and returns a sentence containing
        every word of the keyword; "" if none is found.
        '''
        words = word_tokenize(keyword)
        for word in words:
            if word not in self.sentence_for_max_word_score:
                continue
            sentence = self.sentence_for_max_word_score[word]
            # accept the sentence only if ALL keyword words appear in it
            all_present = True
            for w in words:
                if w not in sentence:
                    all_present = False
            if all_present:
                return sentence
        return ""
    def rank_keywords(self):
        '''Rank keywords in descending order of score
        into self.candidate_triples.'''
        self.candidate_triples = []  # (score, keyword, corresponding sentence)
        for candidate_keyword in self.candidate_keywords:
            self.candidate_triples.append([
                self.get_keyword_score(candidate_keyword),
                candidate_keyword,
                self.get_corresponding_sentence_for_keyword(candidate_keyword)
            ])
        self.candidate_triples.sort(reverse=True)
    def form_questions(self):
        ''' Forms the questions and populates self.questions_dict:
        each candidate's sentence is used at most once, with the
        keyword blanked out by underscores of the same length.
        '''
        used_sentences = list()
        idx = 0
        cntr = 1
        num_candidates = len(self.candidate_triples)
        while cntr <= self.num_questions and idx < num_candidates:
            candidate_triple = self.candidate_triples[idx]
            if candidate_triple[2] not in used_sentences:
                used_sentences.append(candidate_triple[2])
                self.questions_dict[cntr] = {
                    "question": candidate_triple[2].replace(
                        candidate_triple[1],
                        '_' * len(candidate_triple[1])),
                    "answer": candidate_triple[1]
                }
                cntr += 1
            idx += 1

View File

@ -0,0 +1,31 @@
'''This module ties together the
questions generation and incorrect answer
generation modules
'''
from question_extraction import QuestionExtractor
from incorrect_answer_generation import IncorrectAnswerGenerator
class QuestionGeneration:
    '''Coordinates question extraction and incorrect-answer
    generation to produce complete multiple-choice questions.
    '''
    def __init__(self, num_questions, num_options):
        self.num_questions = num_questions
        self.num_options = num_options
        self.question_extractor = QuestionExtractor(num_questions)
    def generate_questions_dict(self, document):
        '''Returns the questions dict with an "options" dict
        attached to every extracted question.'''
        # Extract the questions first, then attach distractor options.
        self.questions_dict = self.question_extractor.get_questions_dict(document)
        self.incorrect_answer_generator = IncorrectAnswerGenerator(document)
        for question_number in range(1, self.num_questions + 1):
            entry = self.questions_dict.get(question_number)
            if entry is None:
                # Fewer questions were extracted than requested.
                continue
            entry["options"] = \
                self.incorrect_answer_generator.get_all_options_dict(
                    entry["answer"],
                    self.num_options
                )
        return self.questions_dict