Merge branch 'nlp' of https://github.com/PragatiVerma18/Fantastic-Falcons-1.0 into webapp
This commit is contained in:
commit
3076819535
60
incorrect_answer_generation.py
Normal file
60
incorrect_answer_generation.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
''' This module contains the class
|
||||||
|
for generating incorrect alternative
|
||||||
|
answers for a given answer
|
||||||
|
'''
|
||||||
|
import gensim
|
||||||
|
import gensim.downloader as api
|
||||||
|
from gensim.models import Word2Vec
|
||||||
|
from nltk.tokenize import sent_tokenize, word_tokenize
|
||||||
|
import random
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
class IncorrectAnswerGenerator:
    """Generates plausible-but-incorrect alternative answers (distractors)
    for a given correct answer, using word-embedding similarity.
    """

    def __init__(self, document):
        """Load the embedding model and collect the document vocabulary.

        Params:
            * document : str -- source text; its unique tokens serve as a
              fallback pool of candidate distractors.
        """
        # model required to fetch similar words
        self.model = api.load("glove-wiki-gigaword-100")

        # unique vocabulary of the document (fallback distractor pool)
        self.all_words = []
        for sent in sent_tokenize(document):
            self.all_words.extend(word_tokenize(sent))
        self.all_words = list(set(self.all_words))

    def get_all_options_dict(self, answer, num_options):
        """Returns a dict of `num_options` options (keys 1..num_options)
        out of which exactly one is correct and is the answer.

        Params:
            * answer : str
            * num_options : int
        Returns:
            * dict[int, str]
        """
        options_dict = dict()

        try:
            # Least-similar of the model's top-15 neighbours first, so the
            # distractors are related but not trivially close to the answer.
            similar_words = self.model.similar_by_word(answer, topn=15)[::-1]

            for i in range(1, num_options + 1):
                options_dict[i] = similar_words[i - 1][0]

        except (KeyError, IndexError):
            # KeyError: `answer` is out of the embedding vocabulary.
            # IndexError: fewer than `num_options` neighbours were returned.
            # Fall back to ranking the document's own vocabulary by
            # similarity to the answer.
            self.all_sim = []
            for word in self.all_words:
                if word not in answer:
                    try:
                        self.all_sim.append(
                            (self.model.similarity(answer, word), word))
                    except KeyError:
                        # Word missing from the embedding vocabulary:
                        # rank it last among the non-answer words.
                        self.all_sim.append(
                            (0.0, word))
                else:
                    # Words contained in the answer itself must never be
                    # offered as distractors; push them to the bottom.
                    self.all_sim.append((-1.0, word))

            self.all_sim.sort(reverse=True)

            for i in range(1, num_options + 1):
                options_dict[i] = self.all_sim[i - 1][1]

        # Place the correct answer at a random position among the options.
        replacement_idx = random.randint(1, num_options)

        options_dict[replacement_idx] = answer

        return options_dict
|
||||||
199
question_extraction.py
Normal file
199
question_extraction.py
Normal file
@ -0,0 +1,199 @@
|
|||||||
|
'''This file contains the module for extracting questions from a document
|
||||||
|
'''
|
||||||
|
import nltk
|
||||||
|
import spacy
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.tokenize import sent_tokenize, word_tokenize
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
|
||||||
|
|
||||||
|
class QuestionExtractor:
    """Extracts fill-in-the-blank questions from a document.

    Candidate answers are named entities (via spaCy NER); they are ranked
    by TF-IDF scores, and each selected entity is blanked out of its
    source sentence to form a question.
    """

    def __init__(self, num_questions):
        """
        Params:
            * num_questions : int -- maximum number of questions to form.
        """
        self.num_questions = num_questions

        # hash set for fast stopword lookup
        self.stop_words = set(stopwords.words('english'))

        # named entity recognition tagger
        self.ner_tagger = spacy.load('en_core_web_md')

        self.vectorizer = TfidfVectorizer()

        self.questions_dict = dict()

    def get_questions_dict(self, document):
        """
        Returns a dict of questions in the format:
            question_number: {
                "question": str,
                "answer": str
            }

        Params:
            * document : str
        Returns:
            * dict
        """
        # find candidate keywords (named entities)
        self.candidate_keywords = self.get_candidate_entities(document)

        # set word scores before ranking candidate keywords
        self.set_tfidf_scores(document)

        # rank the keywords using calculated tf-idf scores
        self.rank_keywords()

        # form the questions
        self.form_questions()

        return self.questions_dict

    def get_filtered_sentences(self, document):
        """Returns a list of sentences, each of which has been cleaned
        of stopwords.

        Params:
            * document : str -- a paragraph of sentences
        Returns:
            * list[str]
        """
        sentences = sent_tokenize(document)  # split document into sentences

        return [self.filter_sentence(sentence) for sentence in sentences]

    def filter_sentence(self, sentence):
        """Returns the sentence without stopwords.

        Params:
            * sentence : str
        Returns:
            * str
        """
        words = word_tokenize(sentence)
        return ' '.join(w for w in words if w not in self.stop_words)

    def get_candidate_entities(self, document):
        """Returns the unique entities found by spaCy's NER tagger.
        These entities are the candidate answers for the questions.

        Params:
            * document : str
        Returns:
            * list[str]
        """
        entities = self.ner_tagger(document)
        entity_list = []

        for ent in entities.ents:
            entity_list.append(ent.text)

        return list(set(entity_list))  # remove duplicates

    def set_tfidf_scores(self, document):
        """Sets the tf-idf score for each word, and records, per word, the
        sentence in which that word's score is maximal."""
        self.unfiltered_sentences = sent_tokenize(document)
        self.filtered_sentences = self.get_filtered_sentences(document)

        self.word_score = dict()  # (word, score)

        # (word, sentence where word score is max)
        self.sentence_for_max_word_score = dict()

        tf_idf_vector = self.vectorizer.fit_transform(self.filtered_sentences)
        # scikit-learn 1.0 renamed get_feature_names to get_feature_names_out
        # and removed the old spelling in 1.2 -- support both.
        get_names = getattr(self.vectorizer, 'get_feature_names_out',
                            None) or self.vectorizer.get_feature_names
        feature_names = list(get_names())
        tf_idf_matrix = tf_idf_vector.todense().tolist()

        num_sentences = len(self.unfiltered_sentences)
        num_features = len(feature_names)

        for i in range(num_features):
            word = feature_names[i]
            self.sentence_for_max_word_score[word] = ""
            tot = 0.0
            cur_max = 0.0

            for j in range(num_sentences):
                tot += tf_idf_matrix[j][i]

                if tf_idf_matrix[j][i] > cur_max:
                    cur_max = tf_idf_matrix[j][i]
                    self.sentence_for_max_word_score[word] = \
                        self.unfiltered_sentences[j]

            # average score for each word
            self.word_score[word] = tot / num_sentences

    def get_keyword_score(self, keyword):
        """Returns the score for a keyword: the sum of the tf-idf scores
        of its individual words.

        Params:
            * keyword : string of possibly several words
        Returns:
            * float : score
        """
        score = 0.0
        for word in word_tokenize(keyword):
            if word in self.word_score:
                score += self.word_score[word]
        return score

    def get_corresponding_sentence_for_keyword(self, keyword):
        """Finds and returns a sentence containing all the keyword's
        words, or "" if none is found.
        """
        words = word_tokenize(keyword)
        for word in words:

            if word not in self.sentence_for_max_word_score:
                continue

            sentence = self.sentence_for_max_word_score[word]

            # NOTE(review): substring containment, so "cat" also matches
            # "category" -- confirm this looseness is intended.
            all_present = True
            for w in words:
                if w not in sentence:
                    all_present = False

            if all_present:
                return sentence
        return ""

    def rank_keywords(self):
        """Ranks keywords according to their score (highest first)."""
        self.candidate_triples = []  # (score, keyword, corresponding sentence)

        for candidate_keyword in self.candidate_keywords:
            self.candidate_triples.append([
                self.get_keyword_score(candidate_keyword),
                candidate_keyword,
                self.get_corresponding_sentence_for_keyword(candidate_keyword)
            ])

        self.candidate_triples.sort(reverse=True)

    def form_questions(self):
        """Forms the questions by blanking each chosen keyword out of its
        sentence, and populates self.questions_dict.

        Keys are contiguous 1..k with k <= num_questions; callers must
        tolerate k < num_questions when candidates run out.
        """
        # set: membership tests only, and O(1) instead of list's O(n)
        used_sentences = set()
        idx = 0
        cntr = 1
        num_candidates = len(self.candidate_triples)
        while cntr <= self.num_questions and idx < num_candidates:
            candidate_triple = self.candidate_triples[idx]

            # each sentence may yield at most one question
            if candidate_triple[2] not in used_sentences:
                used_sentences.add(candidate_triple[2])

                self.questions_dict[cntr] = {
                    "question": candidate_triple[2].replace(
                        candidate_triple[1],
                        '_' * len(candidate_triple[1])),
                    "answer": candidate_triple[1]
                }

                cntr += 1
            idx += 1
|
||||||
31
question_generation_main.py
Normal file
31
question_generation_main.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
'''This module ties together the
|
||||||
|
questions generation and incorrect answer
|
||||||
|
generation modules
|
||||||
|
'''
|
||||||
|
from question_extraction import QuestionExtractor
|
||||||
|
from incorrect_answer_generation import IncorrectAnswerGenerator
|
||||||
|
|
||||||
|
class QuestionGeneration:
    """Facade tying question extraction together with incorrect-answer
    (distractor) generation to produce complete multiple-choice questions.
    """

    def __init__(self, num_questions, num_options):
        self.num_questions = num_questions
        self.num_options = num_options
        self.question_extractor = QuestionExtractor(num_questions)

    def generate_questions_dict(self, document):
        """Extracts questions from `document`, attaches an "options" dict
        to every extracted question, and returns the questions dict."""
        self.questions_dict = self.question_extractor.get_questions_dict(
            document)
        self.incorrect_answer_generator = IncorrectAnswerGenerator(document)

        # Extraction may produce fewer than num_questions entries, so only
        # visit the question numbers that actually exist.
        for number in range(1, self.num_questions + 1):
            entry = self.questions_dict.get(number)
            if entry is None:
                continue
            entry["options"] = \
                self.incorrect_answer_generator.get_all_options_dict(
                    entry["answer"], self.num_options)

        return self.questions_dict
|
||||||
Loading…
x
Reference in New Issue
Block a user