Module emse-mms.models.meta
Expand source code
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
class MetaRecommender:
    """
    Meta-based (content-based) recommender for modules.

    The module catalogue is loaded from a JSON file. For every module the name,
    intro, description, keywords and objectives are concatenated into a single
    text feature, the text is vectorised with a bag-of-words ``CountVectorizer``
    and modules are ranked by cosine similarity to the target module.

    The JSON records are expected to provide the columns ``id`` (a mapping with
    a ``"$oid"`` key), ``moduleName``, ``intro``, ``description``, ``keywords``
    and ``objectives`` -- these are the columns the methods below read.

    :param path: path to the JSON file holding the module catalogue
    :param target: name of the module to find recommendations for
        (matched case-insensitively)
    :param size: maximum number of recommendations to return
    """

    # Text columns that are concatenated (in this order) into the feature string.
    _TEXT_COLUMNS = ("moduleName", "intro", "description", "keywords", "objectives")

    def __init__(self, path: str, target: str, size: int = 10):
        self.df = pd.read_json(path)
        self.featureDf = pd.DataFrame()
        self.target = target
        self.stop = stopwords.words("english")
        self.cv = CountVectorizer()
        self.similarity = None
        self.size = size

    @staticmethod
    def _convert_module_id(df: pd.Series) -> pd.Series:
        """
        Convert the module id from a mongo Object ID mapping to a string.

        Used with ``DataFrame.apply(..., axis=1)``, so ``df`` is a single row.

        :param df: the row to be modified by the apply function
        :return: the same row with the ``"$oid"`` value stored under ``_id``
        """
        df["_id"] = dict(df["id"])["$oid"]
        return df

    @staticmethod
    def _scalar_to_str(df: pd.Series) -> pd.Series:
        """
        Flatten the ``keywords`` and ``objectives`` collections of a row into
        space-separated strings.

        Used with ``DataFrame.apply(..., axis=1)``, so ``df`` is a single row.

        :param df: the row to be modified by the apply function
        :return: the same row with both columns replaced by strings
        """
        for column in ("keywords", "objectives"):
            df[column] = " ".join(str(item) for item in list(df[column]))
        return df

    def _clean_data(self, cleanID, cleanScalar):
        """
        Normalise the raw catalogue in ``self.df``.

        :param cleanID: row-wise callable that extracts the object id into ``_id``
        :param cleanScalar: row-wise callable that flattens list columns to text
        """
        frame = self.df.apply(cleanID, axis=1)
        # The raw mongo id mapping is no longer needed once ``_id`` exists.
        frame = frame.drop(["id"], axis=1).reset_index(drop=True)
        self.df = frame.apply(cleanScalar, axis=1)

    def _create_features(self):
        """
        Build the ``features`` text column and add a 1-based numeric ``id``
        column to ``self.df``.
        """
        text_columns = (self.df[name] for name in self._TEXT_COLUMNS)
        self.df["features"] = [" ".join(parts) for parts in zip(*text_columns)]
        self.df.insert(1, "id", list(range(1, len(self.df) + 1)), True)

    def _text_processing(self, col: pd.Series) -> pd.Series:
        """
        Lower-case the text, strip every character that is not ``a-z`` or a
        space, and remove stop words.

        :param col: series of raw feature strings
        :return: series of cleaned, space-separated tokens
        """
        stop_words = set(self.stop)  # set membership instead of a list scan per token
        # ``regex=True`` must be explicit: since pandas 2.0 the default treats
        # the pattern as a literal string and nothing would be stripped.
        cleaned = col.str.lower().str.replace(r"[^a-z ]", "", regex=True)
        return cleaned.str.split().apply(
            lambda tokens: " ".join(tok for tok in tokens if tok not in stop_words)
        )

    def _get_cosine(self):
        """Vectorise the cleaned features and store the pairwise cosine similarity matrix."""
        count_matrix = self.cv.fit_transform(self.featureDf["cleaned_features"])
        self.similarity = cosine_similarity(count_matrix)

    def _get_recommendations(self):
        """
        Rank all other modules by similarity to the target module.

        :return: a list of ``{"title", "id", "score"}`` dictionaries sorted by
            descending score (at most ``self.size`` entries), or
            ``{"error": "Module not found"}`` when no module name matches
        """
        wanted = str(self.target).lower()
        is_target = (self.df["moduleName"].str.lower() == wanted).to_numpy()
        hits = is_target.nonzero()[0]
        if len(hits) == 0:
            return {"error": "Module not found"}

        # Rows of the similarity matrix follow the row order of ``self.df``, so
        # the 0-based row position is used here -- not the 1-based ``id`` column.
        target_pos = int(hits[0])
        ranked = sorted(
            (
                (pos, score)
                for pos, score in enumerate(self.similarity[target_pos])
                if pos != target_pos  # never recommend the target itself
            ),
            key=lambda pair: pair[1],
            reverse=True,
        )

        limit = max(0, min(self.size, len(ranked)))
        titles = self.df["moduleName"].to_numpy()
        oids = self.df["_id"].to_numpy()

        results = []
        for rank, (pos, score) in enumerate(ranked[:limit], start=1):
            print(rank, titles[pos], oids[pos], score)
            results.append({"title": titles[pos], "id": oids[pos], "score": score})
        return results

    def run(self):
        """
        Run the recommender system for the target module name passed in.

        The result is sorted by the cosine similarity score and has the format
        *[{"title": module_title, "id": module_oid, "score": cosine_similarity_score}, ...]*

        :return: a list of recommended modules, or a dictionary containing an
            error message when the target module is not found
        """
        self._clean_data(self._convert_module_id, self._scalar_to_str)
        self._create_features()
        # Copy so that adding a column does not write into a slice of self.df.
        self.featureDf = self.df[["id", "features"]].copy()
        self.featureDf["cleaned_features"] = self._text_processing(
            self.featureDf["features"]
        )
        self._get_cosine()
        return self._get_recommendations()
Classes
class MetaRecommender (path: str, target: str, size: int = 10)
-
This class is used to recommend modules based on the module name, description, keywords and objectives. This is commonly referred to as a Meta-Based Recommender System. The class takes in a path to a json file, a target module name and a size of the recommendation list. The class will then return a list of recommended modules based on the target module name.
Expand source code
class MetaRecommender:
    """
    Meta-based (content-based) recommender for modules.

    The module catalogue is loaded from a JSON file. For every module the name,
    intro, description, keywords and objectives are concatenated into a single
    text feature, the text is vectorised with a bag-of-words ``CountVectorizer``
    and modules are ranked by cosine similarity to the target module.

    The JSON records are expected to provide the columns ``id`` (a mapping with
    a ``"$oid"`` key), ``moduleName``, ``intro``, ``description``, ``keywords``
    and ``objectives`` -- these are the columns the methods below read.

    :param path: path to the JSON file holding the module catalogue
    :param target: name of the module to find recommendations for
        (matched case-insensitively)
    :param size: maximum number of recommendations to return
    """

    # Text columns that are concatenated (in this order) into the feature string.
    _TEXT_COLUMNS = ("moduleName", "intro", "description", "keywords", "objectives")

    def __init__(self, path: str, target: str, size: int = 10):
        self.df = pd.read_json(path)
        self.featureDf = pd.DataFrame()
        self.target = target
        self.stop = stopwords.words("english")
        self.cv = CountVectorizer()
        self.similarity = None
        self.size = size

    @staticmethod
    def _convert_module_id(df: pd.Series) -> pd.Series:
        """
        Convert the module id from a mongo Object ID mapping to a string.

        Used with ``DataFrame.apply(..., axis=1)``, so ``df`` is a single row.

        :param df: the row to be modified by the apply function
        :return: the same row with the ``"$oid"`` value stored under ``_id``
        """
        df["_id"] = dict(df["id"])["$oid"]
        return df

    @staticmethod
    def _scalar_to_str(df: pd.Series) -> pd.Series:
        """
        Flatten the ``keywords`` and ``objectives`` collections of a row into
        space-separated strings.

        Used with ``DataFrame.apply(..., axis=1)``, so ``df`` is a single row.

        :param df: the row to be modified by the apply function
        :return: the same row with both columns replaced by strings
        """
        for column in ("keywords", "objectives"):
            df[column] = " ".join(str(item) for item in list(df[column]))
        return df

    def _clean_data(self, cleanID, cleanScalar):
        """
        Normalise the raw catalogue in ``self.df``.

        :param cleanID: row-wise callable that extracts the object id into ``_id``
        :param cleanScalar: row-wise callable that flattens list columns to text
        """
        frame = self.df.apply(cleanID, axis=1)
        # The raw mongo id mapping is no longer needed once ``_id`` exists.
        frame = frame.drop(["id"], axis=1).reset_index(drop=True)
        self.df = frame.apply(cleanScalar, axis=1)

    def _create_features(self):
        """
        Build the ``features`` text column and add a 1-based numeric ``id``
        column to ``self.df``.
        """
        text_columns = (self.df[name] for name in self._TEXT_COLUMNS)
        self.df["features"] = [" ".join(parts) for parts in zip(*text_columns)]
        self.df.insert(1, "id", list(range(1, len(self.df) + 1)), True)

    def _text_processing(self, col: pd.Series) -> pd.Series:
        """
        Lower-case the text, strip every character that is not ``a-z`` or a
        space, and remove stop words.

        :param col: series of raw feature strings
        :return: series of cleaned, space-separated tokens
        """
        stop_words = set(self.stop)  # set membership instead of a list scan per token
        # ``regex=True`` must be explicit: since pandas 2.0 the default treats
        # the pattern as a literal string and nothing would be stripped.
        cleaned = col.str.lower().str.replace(r"[^a-z ]", "", regex=True)
        return cleaned.str.split().apply(
            lambda tokens: " ".join(tok for tok in tokens if tok not in stop_words)
        )

    def _get_cosine(self):
        """Vectorise the cleaned features and store the pairwise cosine similarity matrix."""
        count_matrix = self.cv.fit_transform(self.featureDf["cleaned_features"])
        self.similarity = cosine_similarity(count_matrix)

    def _get_recommendations(self):
        """
        Rank all other modules by similarity to the target module.

        :return: a list of ``{"title", "id", "score"}`` dictionaries sorted by
            descending score (at most ``self.size`` entries), or
            ``{"error": "Module not found"}`` when no module name matches
        """
        wanted = str(self.target).lower()
        is_target = (self.df["moduleName"].str.lower() == wanted).to_numpy()
        hits = is_target.nonzero()[0]
        if len(hits) == 0:
            return {"error": "Module not found"}

        # Rows of the similarity matrix follow the row order of ``self.df``, so
        # the 0-based row position is used here -- not the 1-based ``id`` column.
        target_pos = int(hits[0])
        ranked = sorted(
            (
                (pos, score)
                for pos, score in enumerate(self.similarity[target_pos])
                if pos != target_pos  # never recommend the target itself
            ),
            key=lambda pair: pair[1],
            reverse=True,
        )

        limit = max(0, min(self.size, len(ranked)))
        titles = self.df["moduleName"].to_numpy()
        oids = self.df["_id"].to_numpy()

        results = []
        for rank, (pos, score) in enumerate(ranked[:limit], start=1):
            print(rank, titles[pos], oids[pos], score)
            results.append({"title": titles[pos], "id": oids[pos], "score": score})
        return results

    def run(self):
        """
        Run the recommender system for the target module name passed in.

        The result is sorted by the cosine similarity score and has the format
        *[{"title": module_title, "id": module_oid, "score": cosine_similarity_score}, ...]*

        :return: a list of recommended modules, or a dictionary containing an
            error message when the target module is not found
        """
        self._clean_data(self._convert_module_id, self._scalar_to_str)
        self._create_features()
        # Copy so that adding a column does not write into a slice of self.df.
        self.featureDf = self.df[["id", "features"]].copy()
        self.featureDf["cleaned_features"] = self._text_processing(
            self.featureDf["features"]
        )
        self._get_cosine()
        return self._get_recommendations()
Methods
def run(self)
-
This function is used to run the recommender system. It recommends modules based on the target module name passed in. The list will be sorted by the cosine similarity score. The list will be in the format of
[{"title": module_title, "id": module_oid, "score": cosine_similarity_score}, …]
TODO
Add a check to see if the target module name is in the database.
Return: a list of recommended modules, or a dictionary containing an error message
Expand source code
def run(self):
    """
    Run the recommender system for the target module name passed in.

    The result is sorted by the cosine similarity score and has the format
    *[{"title": module_title, "id": module_oid, "score": cosine_similarity_score}, ...]*

    :return: a list of recommended modules, or a dictionary containing an
        error message when the target module is not found
    """
    self._clean_data(self._convert_module_id, self._scalar_to_str)
    self._create_features()
    # Copy so that adding a column does not write into a slice of self.df
    # (avoids pandas' SettingWithCopy behaviour).
    self.featureDf = self.df[["id", "features"]].copy()
    self.featureDf["cleaned_features"] = self._text_processing(
        self.featureDf["features"]
    )
    self._get_cosine()
    return self._get_recommendations()