Merge branch 'nlp' of https://github.com/PragatiVerma18/Fantastic-Falcons-1.0 into webapp
This commit is contained in:
commit
3076819535
60
incorrect_answer_generation.py
Normal file
60
incorrect_answer_generation.py
Normal file
@ -0,0 +1,60 @@
|
||||
''' This module contains the class
|
||||
for generating incorrect alternative
|
||||
answers for a given answer
|
||||
'''
|
||||
import gensim
|
||||
import gensim.downloader as api
|
||||
from gensim.models import Word2Vec
|
||||
from nltk.tokenize import sent_tokenize, word_tokenize
|
||||
import random
|
||||
import numpy as np
|
||||
|
||||
class IncorrectAnswerGenerator:
    '''Generates incorrect alternative (distractor) answers for a
    given correct answer, using word-vector similarity.
    '''

    def __init__(self, document):
        # Pre-trained embeddings used to fetch words similar to the answer.
        # NOTE: loads a large model; slow (and downloads) on first use.
        self.model = api.load("glove-wiki-gigaword-100")

        # Unique vocabulary of the document, used as a fallback pool of
        # distractors when the answer is missing from the model vocabulary.
        self.all_words = []
        for sent in sent_tokenize(document):
            self.all_words.extend(word_tokenize(sent))
        self.all_words = list(set(self.all_words))

    def get_all_options_dict(self, answer, num_options):
        ''' Returns a dict of `num_options` options (keyed 1..num_options)
        out of which one is correct and is the answer; the answer is
        inserted at a random slot, replacing one generated distractor.

        Params:
            * answer : str
            * num_options : int
        Returns:
            * dict[int, str]
        '''
        options_dict = dict()
        try:
            # Reversed so the least-similar of the top 15 come first,
            # keeping distractors plausible but not near-synonyms.
            similar_words = self.model.similar_by_word(answer, topn=15)[::-1]

            for i in range(1, num_options + 1):
                options_dict[i] = similar_words[i - 1][0]

        # KeyError: answer is out of the embedding vocabulary.
        # IndexError: fewer than num_options similar words available.
        # (Was a bare `except:`, which hid real programming errors.)
        except (KeyError, IndexError):
            # Fall back to ranking the document's own words by
            # similarity to the answer.
            self.all_sim = []
            for word in self.all_words:
                if word not in answer:
                    try:
                        self.all_sim.append(
                            (self.model.similarity(answer, word), word))
                    except KeyError:
                        # Word itself is out of vocabulary; keep it with
                        # a neutral score so it can still be an option.
                        self.all_sim.append(
                            (0.0, word))
                else:
                    # Words contained in the answer are pushed to the
                    # bottom so they are never picked as distractors.
                    self.all_sim.append((-1.0, word))

            self.all_sim.sort(reverse=True)

            for i in range(1, num_options + 1):
                options_dict[i] = self.all_sim[i - 1][1]

        # Overwrite one randomly chosen slot with the correct answer.
        replacement_idx = random.randint(1, num_options)

        options_dict[replacement_idx] = answer

        return options_dict
|
||||
199
question_extraction.py
Normal file
199
question_extraction.py
Normal file
@ -0,0 +1,199 @@
|
||||
'''This file contains the module for generating
questions from a document.
'''
|
||||
import nltk
|
||||
import spacy
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.tokenize import sent_tokenize, word_tokenize
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
|
||||
|
||||
class QuestionExtractor:
    ''' Extracts fill-in-the-blank questions from a given document by
    ranking spaCy named entities with tf-idf scores and blanking the
    highest-scoring entity out of its best sentence.
    '''

    def __init__(self, num_questions):
        # Number of questions to produce per document.
        self.num_questions = num_questions

        # hash set for fast stopword lookup
        self.stop_words = set(stopwords.words('english'))

        # named entity recognition tagger; entities become candidate answers
        self.ner_tagger = spacy.load('en_core_web_md')

        self.vectorizer = TfidfVectorizer()

        self.questions_dict = dict()

    def get_questions_dict(self, document):
        '''
        Returns a dict of questions in the format:
            question_number: {
                question: str
                answer: str
            }

        Params:
            * document : string
        Returns:
            * dict
        '''
        # Bug fix: reset so repeated calls do not leak questions
        # produced from a previously processed document.
        self.questions_dict = dict()

        # find candidate keywords
        self.candidate_keywords = self.get_candidate_entities(document)

        # set word scores before ranking candidate keywords
        self.set_tfidf_scores(document)

        # rank the keywords using calculated tf idf scores
        self.rank_keywords()

        # form the questions
        self.form_questions()

        return self.questions_dict

    def get_filtered_sentences(self, document):
        ''' Returns a list of sentences - each of
        which has been cleaned of stopwords.

        Params:
            * document: a paragraph of sentences
        Returns:
            * list<str> : list of string
        '''
        sentences = sent_tokenize(document)  # split document into sentences

        return [self.filter_sentence(sentence) for sentence in sentences]

    def filter_sentence(self, sentence):
        '''Returns the sentence without stopwords

        Params:
            * sentence: A string
        Returns:
            * string
        '''
        words = word_tokenize(sentence)
        return ' '.join(w for w in words if w not in self.stop_words)

    def get_candidate_entities(self, document):
        ''' Returns a list of entities according to
        spacy's ner tagger. These entities are candidates
        for the questions.

        Params:
            * document : string
        Returns:
            * list<str>
        '''
        entities = self.ner_tagger(document)
        entity_list = []

        for ent in entities.ents:
            entity_list.append(ent.text)

        return list(set(entity_list))  # remove duplicates

    def set_tfidf_scores(self, document):
        ''' Sets the tf-idf scores for each word'''
        self.unfiltered_sentences = sent_tokenize(document)
        self.filtered_sentences = self.get_filtered_sentences(document)

        self.word_score = dict()  # (word, score)

        # (word, sentence where word score is max)
        self.sentence_for_max_word_score = dict()

        tf_idf_vector = self.vectorizer.fit_transform(self.filtered_sentences)
        # get_feature_names() was removed in scikit-learn 1.2;
        # prefer the replacement when available.
        if hasattr(self.vectorizer, 'get_feature_names_out'):
            feature_names = self.vectorizer.get_feature_names_out()
        else:
            feature_names = self.vectorizer.get_feature_names()
        tf_idf_matrix = tf_idf_vector.todense().tolist()

        num_sentences = len(self.unfiltered_sentences)
        num_features = len(feature_names)

        for i in range(num_features):
            word = feature_names[i]
            self.sentence_for_max_word_score[word] = ""
            tot = 0.0
            cur_max = 0.0

            for j in range(num_sentences):
                tot += tf_idf_matrix[j][i]

                # remember the sentence where this word scores highest
                if tf_idf_matrix[j][i] > cur_max:
                    cur_max = tf_idf_matrix[j][i]
                    self.sentence_for_max_word_score[word] = \
                        self.unfiltered_sentences[j]

            # average score for each word
            self.word_score[word] = tot / num_sentences

    def get_keyword_score(self, keyword):
        ''' Returns the score for a keyword
        (sum of the scores of its constituent words).

        Params:
            * keyword : string of possibly several words
        Returns:
            * float : score
        '''
        score = 0.0
        for word in word_tokenize(keyword):
            if word in self.word_score:
                score += self.word_score[word]
        return score

    def get_corresponding_sentence_for_keyword(self, keyword):
        ''' Finds and returns a sentence containing
        all the words of the keyword; "" if none is found.
        '''
        words = word_tokenize(keyword)
        for word in words:

            if word not in self.sentence_for_max_word_score:
                continue

            sentence = self.sentence_for_max_word_score[word]

            all_present = True
            for w in words:
                if w not in sentence:
                    all_present = False

            if all_present:
                return sentence
        return ""

    def rank_keywords(self):
        '''Rank keywords according to their score'''
        self.candidate_triples = []  # (score, keyword, corresponding sentence)

        for candidate_keyword in self.candidate_keywords:
            self.candidate_triples.append([
                self.get_keyword_score(candidate_keyword),
                candidate_keyword,
                self.get_corresponding_sentence_for_keyword(candidate_keyword)
            ])

        # highest-scoring keywords first
        self.candidate_triples.sort(reverse=True)

    def form_questions(self):
        ''' Forms the questions by blanking each ranked keyword out of
        its sentence, and populates self.questions_dict. Each sentence
        is used at most once.
        '''
        used_sentences = list()
        idx = 0
        cntr = 1
        num_candidates = len(self.candidate_triples)
        while cntr <= self.num_questions and idx < num_candidates:
            candidate_triple = self.candidate_triples[idx]

            if candidate_triple[2] not in used_sentences:
                used_sentences.append(candidate_triple[2])

                self.questions_dict[cntr] = {
                    "question": candidate_triple[2].replace(
                        candidate_triple[1],
                        '_' * len(candidate_triple[1])),
                    "answer": candidate_triple[1]
                }

                cntr += 1
            idx += 1
|
||||
31
question_generation_main.py
Normal file
31
question_generation_main.py
Normal file
@ -0,0 +1,31 @@
|
||||
'''This module ties together the
|
||||
questions generation and incorrect answer
|
||||
generation modules
|
||||
'''
|
||||
from question_extraction import QuestionExtractor
|
||||
from incorrect_answer_generation import IncorrectAnswerGenerator
|
||||
|
||||
class QuestionGeneration:
    '''Ties together the question-extraction and incorrect-answer
    generation modules to produce complete multiple-choice questions.
    '''

    def __init__(self, num_questions, num_options):
        self.num_questions = num_questions
        self.num_options = num_options
        self.question_extractor = QuestionExtractor(num_questions)

    def generate_questions_dict(self, document):
        # Extract the questions first, then attach generated
        # answer options to each one.
        self.questions_dict = self.question_extractor.get_questions_dict(document)
        self.incorrect_answer_generator = IncorrectAnswerGenerator(document)

        for q_num in range(1, self.num_questions + 1):
            if q_num not in self.questions_dict:
                continue
            answer = self.questions_dict[q_num]["answer"]
            self.questions_dict[q_num]["options"] = \
                self.incorrect_answer_generator.get_all_options_dict(
                    answer, self.num_options)

        return self.questions_dict
|
||||
Loading…
x
Reference in New Issue
Block a user