Merge branch 'nlp' of https://github.com/PragatiVerma18/Fantastic-Falcons-1.0 into webapp
This commit is contained in:
commit
3076819535
60
incorrect_answer_generation.py
Normal file
60
incorrect_answer_generation.py
Normal file
@ -0,0 +1,60 @@
|
||||
''' This module contains the class
|
||||
for generating incorrect alternative
|
||||
answers for a given answer
|
||||
'''
|
||||
import gensim
|
||||
import gensim.downloader as api
|
||||
from gensim.models import Word2Vec
|
||||
from nltk.tokenize import sent_tokenize, word_tokenize
|
||||
import random
|
||||
import numpy as np
|
||||
|
||||
class IncorrectAnswerGenerator:
    '''Generates incorrect alternative (distractor) answers for a
    given correct answer, using word-vector similarity.
    '''

    def __init__(self, document):
        # Pre-trained embeddings used to fetch words similar to the answer.
        # NOTE: loads a large model; slow (and downloads) on first use.
        self.model = api.load("glove-wiki-gigaword-100")

        # Unique vocabulary of the document, used as a fallback pool of
        # distractors when the answer is missing from the model vocabulary.
        self.all_words = []
        for sent in sent_tokenize(document):
            self.all_words.extend(word_tokenize(sent))
        self.all_words = list(set(self.all_words))

    def get_all_options_dict(self, answer, num_options):
        ''' Returns a dict of `num_options` options (keyed 1..num_options)
        out of which one is correct and is the answer; the answer is
        inserted at a random slot, replacing one generated distractor.

        Params:
            * answer : str
            * num_options : int
        Returns:
            * dict[int, str]
        '''
        options_dict = dict()
        try:
            # Reversed so the least-similar of the top 15 come first,
            # keeping distractors plausible but not near-synonyms.
            similar_words = self.model.similar_by_word(answer, topn=15)[::-1]

            for i in range(1, num_options + 1):
                options_dict[i] = similar_words[i - 1][0]

        # KeyError: answer is out of the embedding vocabulary.
        # IndexError: fewer than num_options similar words available.
        # (Was a bare `except:`, which hid real programming errors.)
        except (KeyError, IndexError):
            # Fall back to ranking the document's own words by
            # similarity to the answer.
            self.all_sim = []
            for word in self.all_words:
                if word not in answer:
                    try:
                        self.all_sim.append(
                            (self.model.similarity(answer, word), word))
                    except KeyError:
                        # Word itself is out of vocabulary; keep it with
                        # a neutral score so it can still be an option.
                        self.all_sim.append(
                            (0.0, word))
                else:
                    # Words contained in the answer are pushed to the
                    # bottom so they are never picked as distractors.
                    self.all_sim.append((-1.0, word))

            self.all_sim.sort(reverse=True)

            for i in range(1, num_options + 1):
                options_dict[i] = self.all_sim[i - 1][1]

        # Overwrite one randomly chosen slot with the correct answer.
        replacement_idx = random.randint(1, num_options)

        options_dict[replacement_idx] = answer

        return options_dict
|
||||
199
question_extraction.py
Normal file
199
question_extraction.py
Normal file
@ -0,0 +1,199 @@
|
||||
'''This file contains the module for generating
questions from a document.
'''
|
||||
import nltk
|
||||
import spacy
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.tokenize import sent_tokenize, word_tokenize
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
|
||||
|
||||
class QuestionExtractor:
    ''' Extracts fill-in-the-blank questions from a given document by
    ranking spaCy named entities with tf-idf scores and blanking the
    highest-scoring entity out of its best sentence.
    '''

    def __init__(self, num_questions):
        # Number of questions to produce per document.
        self.num_questions = num_questions

        # hash set for fast stopword lookup
        self.stop_words = set(stopwords.words('english'))

        # named entity recognition tagger; entities become candidate answers
        self.ner_tagger = spacy.load('en_core_web_md')

        self.vectorizer = TfidfVectorizer()

        self.questions_dict = dict()

    def get_questions_dict(self, document):
        '''
        Returns a dict of questions in the format:
            question_number: {
                question: str
                answer: str
            }

        Params:
            * document : string
        Returns:
            * dict
        '''
        # Bug fix: reset so repeated calls do not leak questions
        # produced from a previously processed document.
        self.questions_dict = dict()

        # find candidate keywords
        self.candidate_keywords = self.get_candidate_entities(document)

        # set word scores before ranking candidate keywords
        self.set_tfidf_scores(document)

        # rank the keywords using calculated tf idf scores
        self.rank_keywords()

        # form the questions
        self.form_questions()

        return self.questions_dict

    def get_filtered_sentences(self, document):
        ''' Returns a list of sentences - each of
        which has been cleaned of stopwords.

        Params:
            * document: a paragraph of sentences
        Returns:
            * list<str> : list of string
        '''
        sentences = sent_tokenize(document)  # split document into sentences

        return [self.filter_sentence(sentence) for sentence in sentences]

    def filter_sentence(self, sentence):
        '''Returns the sentence without stopwords

        Params:
            * sentence: A string
        Returns:
            * string
        '''
        words = word_tokenize(sentence)
        return ' '.join(w for w in words if w not in self.stop_words)

    def get_candidate_entities(self, document):
        ''' Returns a list of entities according to
        spacy's ner tagger. These entities are candidates
        for the questions.

        Params:
            * document : string
        Returns:
            * list<str>
        '''
        entities = self.ner_tagger(document)
        entity_list = []

        for ent in entities.ents:
            entity_list.append(ent.text)

        return list(set(entity_list))  # remove duplicates

    def set_tfidf_scores(self, document):
        ''' Sets the tf-idf scores for each word'''
        self.unfiltered_sentences = sent_tokenize(document)
        self.filtered_sentences = self.get_filtered_sentences(document)

        self.word_score = dict()  # (word, score)

        # (word, sentence where word score is max)
        self.sentence_for_max_word_score = dict()

        tf_idf_vector = self.vectorizer.fit_transform(self.filtered_sentences)
        # get_feature_names() was removed in scikit-learn 1.2;
        # prefer the replacement when available.
        if hasattr(self.vectorizer, 'get_feature_names_out'):
            feature_names = self.vectorizer.get_feature_names_out()
        else:
            feature_names = self.vectorizer.get_feature_names()
        tf_idf_matrix = tf_idf_vector.todense().tolist()

        num_sentences = len(self.unfiltered_sentences)
        num_features = len(feature_names)

        for i in range(num_features):
            word = feature_names[i]
            self.sentence_for_max_word_score[word] = ""
            tot = 0.0
            cur_max = 0.0

            for j in range(num_sentences):
                tot += tf_idf_matrix[j][i]

                # remember the sentence where this word scores highest
                if tf_idf_matrix[j][i] > cur_max:
                    cur_max = tf_idf_matrix[j][i]
                    self.sentence_for_max_word_score[word] = \
                        self.unfiltered_sentences[j]

            # average score for each word
            self.word_score[word] = tot / num_sentences

    def get_keyword_score(self, keyword):
        ''' Returns the score for a keyword
        (sum of the scores of its constituent words).

        Params:
            * keyword : string of possibly several words
        Returns:
            * float : score
        '''
        score = 0.0
        for word in word_tokenize(keyword):
            if word in self.word_score:
                score += self.word_score[word]
        return score

    def get_corresponding_sentence_for_keyword(self, keyword):
        ''' Finds and returns a sentence containing
        all the words of the keyword; "" if none is found.
        '''
        words = word_tokenize(keyword)
        for word in words:

            if word not in self.sentence_for_max_word_score:
                continue

            sentence = self.sentence_for_max_word_score[word]

            all_present = True
            for w in words:
                if w not in sentence:
                    all_present = False

            if all_present:
                return sentence
        return ""

    def rank_keywords(self):
        '''Rank keywords according to their score'''
        self.candidate_triples = []  # (score, keyword, corresponding sentence)

        for candidate_keyword in self.candidate_keywords:
            self.candidate_triples.append([
                self.get_keyword_score(candidate_keyword),
                candidate_keyword,
                self.get_corresponding_sentence_for_keyword(candidate_keyword)
            ])

        # highest-scoring keywords first
        self.candidate_triples.sort(reverse=True)

    def form_questions(self):
        ''' Forms the questions by blanking each ranked keyword out of
        its sentence, and populates self.questions_dict. Each sentence
        is used at most once.
        '''
        used_sentences = list()
        idx = 0
        cntr = 1
        num_candidates = len(self.candidate_triples)
        while cntr <= self.num_questions and idx < num_candidates:
            candidate_triple = self.candidate_triples[idx]

            if candidate_triple[2] not in used_sentences:
                used_sentences.append(candidate_triple[2])

                self.questions_dict[cntr] = {
                    "question": candidate_triple[2].replace(
                        candidate_triple[1],
                        '_' * len(candidate_triple[1])),
                    "answer": candidate_triple[1]
                }

                cntr += 1
            idx += 1
|
||||
31
question_generation_main.py
Normal file
31
question_generation_main.py
Normal file
@ -0,0 +1,31 @@
|
||||
'''This module ties together the
|
||||
questions generation and incorrect answer
|
||||
generation modules
|
||||
'''
|
||||
from question_extraction import QuestionExtractor
|
||||
from incorrect_answer_generation import IncorrectAnswerGenerator
|
||||
|
||||
class QuestionGeneration:
    '''Ties together the question-extraction and incorrect-answer
    generation modules to produce complete multiple-choice questions.
    '''

    def __init__(self, num_questions, num_options):
        self.num_questions = num_questions
        self.num_options = num_options
        self.question_extractor = QuestionExtractor(num_questions)

    def generate_questions_dict(self, document):
        # Extract the questions first, then attach generated
        # answer options to each one.
        self.questions_dict = self.question_extractor.get_questions_dict(document)
        self.incorrect_answer_generator = IncorrectAnswerGenerator(document)

        for q_num in range(1, self.num_questions + 1):
            if q_num not in self.questions_dict:
                continue
            answer = self.questions_dict[q_num]["answer"]
            self.questions_dict[q_num]["options"] = \
                self.incorrect_answer_generator.get_all_options_dict(
                    answer, self.num_options)

        return self.questions_dict
|
||||
Loading…
x
Reference in New Issue
Block a user