Module emse-mms.keys
Expand source code
import pandas as pd
import json
from pandas import DataFrame
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
class Keys:
def __init__(self, target, user_input="input/user_input.json"):
self.similarity_matrix = list()
self.mapping = list()
self.target = target
self.modules = json.loads(open("input/modules.json").read())
self.user_input = json.loads(open(user_input).read())
self.user_df = pd.DataFrame()
self.tf_idf = TfidfVectorizer(stop_words="english")
def __convert_json_scalars__(self, scalar: list) -> str:
"""
Converts a list of scalars into a comma separated string. This is used to convert the skills, education, etc.
into a single string that can be appended to a DataFrame row.
:param scalar: A list of strings to be converted into a comma separated string.
:rtype: str
"""
temp_string = ""
for index, value in enumerate(scalar):
if index == len(scalar) - 1:
temp_string += value
else:
temp_string += value + ", "
return temp_string
def __read_input(self):
df = pd.DataFrame(
columns=[
"id",
"skills",
"education",
"work_experience",
"courses",
"focus",
"degree",
],
index=[0],
)
df["id"] = self.user_input["id"]
df["skills"] = self.__convert_json_scalars__(scalar=self.user_input["skills"])
df["education"] = self.__convert_json_scalars__(
scalar=self.user_input["education"]
)
df["work_experience"] = self.__convert_json_scalars__(
scalar=self.user_input["work_experience"]
)
df["courses"] = self.user_input["courses"]
df["focus"] = self.user_input["focus"]
df["degree"] = self.user_input["degree"]
self.user_df = df
def __read_modules(self):
df = pd.DataFrame(
columns=[
"id",
"moduleName",
"description",
"intro",
"keywords",
"objectives",
],
index=[i for i in range(len(self.modules))],
)
for index, module in enumerate(self.modules):
df.iloc[index]["id"] = module["id"]["$oid"]
df.iloc[index]["moduleName"] = module["moduleName"]
df.iloc[index]["description"] = module["description"]
df.iloc[index]["intro"] = module["intro"]
df.iloc[index]["keywords"] = self.__convert_json_scalars__(
scalar=module["keywords"]
)
df.iloc[index]["objectives"] = self.__convert_json_scalars__(
scalar=module["objectives"]
)
self.modules_df = df
def __analyze_modules(self):
keyword_matrix = self.tf_idf.fit_transform(self.modules_df["keywords"])
similarity_matrix = linear_kernel(keyword_matrix, keyword_matrix)
mapping = pd.Series(self.modules_df.index, index=self.modules_df["moduleName"])
self.similarity_matrix = similarity_matrix
self.mapping = mapping
def recommend_movies_based_on_plot(self, query, mapping, matrix):
movie_index = mapping[query]
# get similarity values with other movies
# similarity_score is the list of index and similarity matrix
similarity_score = list(enumerate(matrix[movie_index]))
# sort in descending order the similarity score of movie inputted with all the other movies
for index, s in enumerate(similarity_score):
id = s[0]
scores = s[1]
# sort scores in descending order
scores = sorted(scores, key=lambda x: x, reverse=True)
similarity_score[index] = (id, scores)
# Get the scores of the 15 most similar movies. Ignore the first movie.
similarity_score = similarity_score[1:15]
# return movie names using the mapping series
movie_indices = [i[0] for i in similarity_score]
return self.modules_df.iloc[movie_indices]
def run(self) -> str:
"""
Runs the entire system in a sequential order.
"""
self.__read_input()
self.__read_modules()
self.__analyze_modules()
recommendation: DataFrame = self.recommend_movies_based_on_plot(
self.target, mapping=self.mapping, matrix=self.similarity_matrix
)
return recommendation.to_json(orient="records")
def main():
"""
Main function to run the system. Instantiates the Keys class and runs the entry method.
"""
keys = Keys(target="Namfix")
keys.run()
if __name__ == "__main__":
main()
Functions
def main()
-
Main function to run the system. Instantiates the Keys class and runs the entry method.
Expand source code
def main(): """ Main function to run the system. Instantiates the Keys class and runs the entry method. """ keys = Keys(target="Namfix") keys.run()
Classes
class Keys (target, user_input='input/user_input.json')
-
Expand source code
class Keys: def __init__(self, target, user_input="input/user_input.json"): self.similarity_matrix = list() self.mapping = list() self.target = target self.modules = json.loads(open("input/modules.json").read()) self.user_input = json.loads(open(user_input).read()) self.user_df = pd.DataFrame() self.tf_idf = TfidfVectorizer(stop_words="english") def __convert_json_scalars__(self, scalar: list) -> str: """ Converts a list of scalars into a comma separated string. This is used to convert the skills, education, etc. into a single string that can be appended to a DataFrame row. :param scalar: A list of strings to be converted into a comma separated string. :rtype: str """ temp_string = "" for index, value in enumerate(scalar): if index == len(scalar) - 1: temp_string += value else: temp_string += value + ", " return temp_string def __read_input(self): df = pd.DataFrame( columns=[ "id", "skills", "education", "work_experience", "courses", "focus", "degree", ], index=[0], ) df["id"] = self.user_input["id"] df["skills"] = self.__convert_json_scalars__(scalar=self.user_input["skills"]) df["education"] = self.__convert_json_scalars__( scalar=self.user_input["education"] ) df["work_experience"] = self.__convert_json_scalars__( scalar=self.user_input["work_experience"] ) df["courses"] = self.user_input["courses"] df["focus"] = self.user_input["focus"] df["degree"] = self.user_input["degree"] self.user_df = df def __read_modules(self): df = pd.DataFrame( columns=[ "id", "moduleName", "description", "intro", "keywords", "objectives", ], index=[i for i in range(len(self.modules))], ) for index, module in enumerate(self.modules): df.iloc[index]["id"] = module["id"]["$oid"] df.iloc[index]["moduleName"] = module["moduleName"] df.iloc[index]["description"] = module["description"] df.iloc[index]["intro"] = module["intro"] df.iloc[index]["keywords"] = self.__convert_json_scalars__( scalar=module["keywords"] ) df.iloc[index]["objectives"] = self.__convert_json_scalars__( scalar=module["objectives"] ) self.modules_df = df def __analyze_modules(self): keyword_matrix = self.tf_idf.fit_transform(self.modules_df["keywords"]) similarity_matrix = linear_kernel(keyword_matrix, keyword_matrix) mapping = pd.Series(self.modules_df.index, index=self.modules_df["moduleName"]) self.similarity_matrix = similarity_matrix self.mapping = mapping def recommend_movies_based_on_plot(self, query, mapping, matrix): movie_index = mapping[query] # get similarity values with other movies # similarity_score is the list of index and similarity matrix similarity_score = list(enumerate(matrix[movie_index])) # sort in descending order the similarity score of movie inputted with all the other movies for index, s in enumerate(similarity_score): id = s[0] scores = s[1] # sort scores in descending order scores = sorted(scores, key=lambda x: x, reverse=True) similarity_score[index] = (id, scores) # Get the scores of the 15 most similar movies. Ignore the first movie. similarity_score = similarity_score[1:15] # return movie names using the mapping series movie_indices = [i[0] for i in similarity_score] return self.modules_df.iloc[movie_indices] def run(self) -> str: """ Runs the entire system in a sequential order. """ self.__read_input() self.__read_modules() self.__analyze_modules() recommendation: DataFrame = self.recommend_movies_based_on_plot( self.target, mapping=self.mapping, matrix=self.similarity_matrix ) return recommendation.to_json(orient="records")
Methods
def recommend_movies_based_on_plot(self, query, mapping, matrix)
-
Expand source code
def recommend_movies_based_on_plot(self, query, mapping, matrix): movie_index = mapping[query] # get similarity values with other movies # similarity_score is the list of index and similarity matrix similarity_score = list(enumerate(matrix[movie_index])) # sort in descending order the similarity score of movie inputted with all the other movies for index, s in enumerate(similarity_score): id = s[0] scores = s[1] # sort scores in descending order scores = sorted(scores, key=lambda x: x, reverse=True) similarity_score[index] = (id, scores) # Get the scores of the 15 most similar movies. Ignore the first movie. similarity_score = similarity_score[1:15] # return movie names using the mapping series movie_indices = [i[0] for i in similarity_score] return self.modules_df.iloc[movie_indices]
def run(self) ‑> str
-
Runs the entire system in a sequential order.
Expand source code
def run(self) -> str: """ Runs the entire system in a sequential order. """ self.__read_input() self.__read_modules() self.__analyze_modules() recommendation: DataFrame = self.recommend_movies_based_on_plot( self.target, mapping=self.mapping, matrix=self.similarity_matrix ) return recommendation.to_json(orient="records")