Module emse-mms.recommend
# get the set of modules the target user has reviewed
# get the rating the user gave for each of those modules
# weight each other student's ratings by their similarity to the target user
# divide the sum of the weighted ratings by the sum of the similarity weights
import asyncio
import random
from math import sqrt

import pandas as pd
from pandas import DataFrame
from prisma import Prisma


async def main(target: str):
    # rec = Recommender(target)
    # await rec.recommend()
    rec = Recs()
    await rec.run()
class Recommender:
    def __init__(self, target: str):
        self.target = target
        self.sample = []
        self.modules = []
        self.recs = []

    async def _sampleModules(self, skip: bool = True):
        prisma = Prisma()
        await prisma.connect()
        modules = await prisma.module.find_many(include={"feedback": True})
        modules = list(map(lambda x: x.dict(), modules))
        # pick half of the modules at random as the target user's "reviewed" set
        rand = random.sample(modules, len(modules) // 2)
        # create module feedback on the sampled modules for the target user
        if not skip:
            await self._seedTargetFeedback(rand)
        await prisma.disconnect()
        self.sample = rand

    async def _seedTargetFeedback(self, sample: list):
        prisma = Prisma()
        await prisma.connect()
        for module in sample:
            await prisma.modulefeedback.create(
                data={
                    "feedback": "This is a sample review",
                    "rating": random.randint(1, 5),
                    # each item in `sample` is a module dict, so connect by its id
                    "module": {"connect": {"id": module["id"]}},
                    "student": {"connect": {"id": self.target}},
                }
            )
            print(f"Created feedback for module {module['id']}")
        await prisma.disconnect()
    async def recommend(self):
        """
        At this point our target user has reviewed half of the modules in the database.
        We have access to the modules the user has reviewed through self.sample, so we
        can now generate recommendations for the user.
        """
        prism = Prisma()
        await prism.connect()
        await self._sampleModules(skip=True)
        df = pd.DataFrame(self.sample)  # not used yet; the steps below are still a plan
        # convert our review data into a user x module matrix
        # find cosine similarity between the target user and all other users
        # get the top 5 users with the highest similarity
        # get the modules that the top 5 users have reviewed
        # get the modules that the target user has not reviewed
        # get the modules that the top 5 users have reviewed that the target user has not reviewed
        # get the average rating for each module
        # get the top 5 modules with the highest average rating
        # return the top 5 modules with the highest average rating as recommendations
        await prism.disconnect()
class Recs:
    def __init__(self, target=None):
        self.prisma = Prisma()
        if target is None:
            # fallback target profile: module ids with the ratings the target user gave them
            target = [
                {"id": "63f4ee98ece0495cbb312604", "title": "orm,", "rating": 5},
                {"id": "63f4ee98ece0495cbb312608", "title": "me", "rating": 3.5},
                {"id": "63f4ee98ece0495cbb3125f5", "title": "2017", "rating": 2},
                {"id": "63f4ee98ece0495cbb3125f9", "title": "Souppe", "rating": 5},
                {"id": "63f4ee98ece0495cbb3125fe", "title": "Frams.", "rating": 4.5},
            ]
        self.inputMovies = pd.DataFrame(target)
        # CSV inputs read at construction time; neither frame is referenced by run() below
        self.movies_df = pd.read_csv("input/movies.csv")
        self.modules_df = None
        self.ratings_df = pd.read_csv("input/ratings.csv")
        self.feedbacks_df = None
        self.userSubsetGroup = None
        self.pearsonCorrelationDict = dict()
        self.tempTopUsersRating = None

    def sampleModules(self):
        """
        - get a random set of 10 modules
        - create a rating for each module
        - return as a list of dicts with id, title and rating
        """
    def cleanData(self):
        """
        Removes all the columns that are not needed for the recommendation engine. This is done to reduce the size of
        the dataset and reduce overall complexity in our data.
        """
        modules_df: DataFrame = self.modules_df.drop(
            [
                "description",
                "duration",
                "intro",
                "numSlides",
                "keywords",
                "objectives",
                "createdAt",
                "updatedAt",
                "members",
                "assignments",
                "parentModules",
                "parentModuleIDs",
                "subModules",
                "subModuleIDs",
                "collections",
                "course",
                "courseIDs",
                "feedback",
                "moduleName",
            ],
            axis=1,
        )
        feedbacks_df: DataFrame = self.feedbacks_df.drop(["student", "module"], axis=1)
        self.feedbacks_df = feedbacks_df
        self.modules_df = modules_df
        pd.options.display.max_columns = 60

    async def __get_module_data(self):
        await self.prisma.connect()
        modules = await self.prisma.module.find_many()
        modules = list(map(lambda x: x.dict(), modules))
        self.modules_df = pd.DataFrame(modules)
        await self.prisma.disconnect()

    async def __get_feedback_data(self):
        await self.prisma.connect()
        feedbacks = await self.prisma.modulefeedback.find_many()
        feedbacks = list(map(lambda x: x.dict(), feedbacks))
        self.feedbacks_df = pd.DataFrame(feedbacks)
        await self.prisma.disconnect()
    def handleUserInput(self):
        # keep only the database modules that appear in the target user's input,
        # then merge so each input row carries the database fields alongside the rating
        inputID = self.modules_df[
            self.modules_df["id"].isin(self.inputMovies["id"].tolist())
        ]
        inputMovies = pd.merge(inputID, self.inputMovies)
        self.inputMovies = inputMovies

    def createSubset(self):
        # all feedback rows for the modules the target user has rated, grouped per
        # student and sorted so the students with the most overlap come first
        userSubset = self.feedbacks_df[
            self.feedbacks_df["moduleId"].isin(self.inputMovies["id"].tolist())
        ]
        userSubsetGroup = userSubset.groupby("studentId")
        userSubsetGroup = sorted(userSubsetGroup, key=lambda x: len(x[1]), reverse=True)
        self.userSubsetGroup = userSubsetGroup
    def createSimilarityMatrix(self):
        # Pearson correlation between the target user's ratings and each candidate
        # student's ratings over the modules they have in common: r = Sxy / sqrt(Sxx * Syy)
        pearsonCorrelationDict = {}
        for name, group in self.userSubsetGroup:
            # sort both frames by module id so the two rating lists are paired per module
            group = group.sort_values(by="moduleId")
            inputMovies = self.inputMovies.sort_values(by="id")
            nRatings = len(group)
            temp_df = inputMovies[inputMovies["id"].isin(group["moduleId"].tolist())]
            tempRatingList = temp_df["rating"].tolist()
            tempGroupList = group["rating"].tolist()
            Sxx = sum([i**2 for i in tempRatingList]) - pow(
                sum(tempRatingList), 2
            ) / float(nRatings)
            Syy = sum([i**2 for i in tempGroupList]) - pow(
                sum(tempGroupList), 2
            ) / float(nRatings)
            Sxy = sum(i * j for i, j in zip(tempRatingList, tempGroupList)) - sum(
                tempRatingList
            ) * sum(tempGroupList) / float(nRatings)
            if Sxx != 0 and Syy != 0:
                pearsonCorrelationDict[name] = Sxy / sqrt(Sxx * Syy)
            else:
                pearsonCorrelationDict[name] = 0
        self.pearsonCorrelationDict = pearsonCorrelationDict
    def topUser(self):
        pearsonDF = pd.DataFrame.from_dict(self.pearsonCorrelationDict, orient="index")
        pearsonDF.columns = ["similarityIndex"]
        pearsonDF["studentId"] = pearsonDF.index
        pearsonDF.index = range(len(pearsonDF))
        # keep the 50 most similar students and weight their ratings by similarity
        topUsers = pearsonDF.sort_values(by="similarityIndex", ascending=False)[0:50]
        print(topUsers.head())
        topUsersRating = topUsers.merge(
            self.feedbacks_df, left_on="studentId", right_on="studentId", how="inner"
        )
        topUsersRating["weightedRating"] = (
            topUsersRating["similarityIndex"] * topUsersRating["rating"]
        )
        tempTopUsersRating = topUsersRating.groupby("moduleId").sum()[
            ["similarityIndex", "weightedRating"]
        ]
        tempTopUsersRating.columns = ["sum_similarityIndex", "sum_weightedRating"]
        self.tempTopUsersRating = tempTopUsersRating

    def recommend(self):
        # weighted-average score = sum(similarity * rating) / sum(similarity)
        recommendation_df = pd.DataFrame()
        recommendation_df["w-AVG score"] = (
            self.tempTopUsersRating["sum_weightedRating"]
            / self.tempTopUsersRating["sum_similarityIndex"]
        )
        recommendation_df["moduleId"] = self.tempTopUsersRating.index
        recommendation_df = recommendation_df.sort_values(
            by="w-AVG score", ascending=False
        )
        print(recommendation_df.head(10))
        # keep only the recommended modules' full records for the JSON response
        mods_df = self.modules_df.loc[
            self.modules_df["id"].isin(recommendation_df.head(20)["moduleId"].tolist())
        ]
        self.modules_df = mods_df

    def convertResultToJSON(self):
        modules = self.modules_df
        return modules.to_json(orient="records")

    async def run(self):
        await self.__get_module_data()
        await self.__get_feedback_data()
        self.cleanData()
        self.handleUserInput()
        self.createSubset()
        self.createSimilarityMatrix()
        self.topUser()
        self.recommend()
        return self.convertResultToJSON()


if __name__ == "__main__":
    asyncio.run(main(target="63f7a3068b546b91eadb20a6"))
Functions
async def main(target: str)
-
Classes
class Recommender (target: str)
-
Methods
async def recommend(self)
-
At this point our target user has reviewed half of the modules in the database. We have access to the modules the user has reviewed through self.sample, so we can now generate recommendations for the user.
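The body of this method is only a commented plan; nothing past the sampling step is implemented. A minimal sketch of that plan, assuming a long-format ratings frame with studentId, moduleId and rating columns (the function name and schema here are illustrative, not part of the module):

import numpy as np
import pandas as pd

def cosine_recommend(ratings: pd.DataFrame, target: str, k: int = 5) -> pd.Series:
    # pivot the long ratings into a user x module matrix, unrated cells as 0
    matrix = ratings.pivot_table(index="studentId", columns="moduleId", values="rating").fillna(0)
    target_vec = matrix.loc[target]
    others = matrix.drop(index=target)
    # cosine similarity between the target row and every other row
    sims = others.dot(target_vec) / (
        np.linalg.norm(others.values, axis=1) * np.linalg.norm(target_vec.values)
    )
    top = sims.nlargest(k)
    # average the top-k neighbours' ratings over modules the target has not rated
    unseen = matrix.columns[(target_vec == 0).to_numpy()]
    avg = matrix.loc[top.index, unseen].replace(0, np.nan).mean()
    return avg.nlargest(k)

The zero-fill means a user with no ratings would divide by zero; a production version would need to guard against that, but the sketch keeps to the steps listed in the comments.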
class Recs (target=None)
-
Methods
def cleanData(self)
-
Removes all the columns that are not needed for the recommendation engine. This is done to reduce the size of the dataset and reduce overall complexity in our data.
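One practical note: DataFrame.drop raises a KeyError if any listed column is absent, so a schema change in Prisma would make the pipeline fail at this step. pandas' errors="ignore" flag skips missing columns instead; a small illustration on a made-up frame (a suggestion, not what the module currently does):

import pandas as pd

df = pd.DataFrame({"id": ["m1"], "description": ["..."], "rating": [4]})
# errors="ignore" silently skips columns that do not exist instead of raising
cleaned = df.drop(columns=["description", "duration", "intro"], errors="ignore")
print(cleaned.columns.tolist())  # ['id', 'rating']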
def convertResultToJSON(self)
-
def createSimilarityMatrix(self)
-
def createSubset(self)
-
def handleUserInput(self)
-
def recommend(self)
-
async def run(self)
-
def sampleModules(self)
-
- get a random set of 10 modules
- create a rating for each module
- return as a list of dicts with id, title and rating (see the sketch below)
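sampleModules is declared but left empty. A minimal sketch of what the docstring describes, written as a free-standing helper and assuming the modules frame exposes an id and a title column (both the function and the column names are assumptions, not part of the module):

import random
import pandas as pd

def sample_modules(modules_df: pd.DataFrame, n: int = 10) -> list:
    # pick up to n random modules and attach a random 1-5 rating to each
    picked = modules_df.sample(n=min(n, len(modules_df)))
    return [
        {"id": row["id"], "title": row.get("title", ""), "rating": random.randint(1, 5)}
        for _, row in picked.iterrows()
    ]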
def topUser(self)
-
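topUser accumulates, per module, the sum of the neighbours' similarity indices and the sum of their similarity-weighted ratings; Recs.recommend then divides the two, which is the weighted average sketched in the comments at the top of the module. A toy illustration with made-up numbers (not data from the database):

import pandas as pd

# two hypothetical neighbours rated modules m1 and m2; similarityIndex is each
# neighbour's Pearson score against the target user
top_users_rating = pd.DataFrame(
    {
        "moduleId": ["m1", "m1", "m2", "m2"],
        "similarityIndex": [0.9, 0.4, 0.8, 0.2],
        "rating": [5, 3, 2, 5],
    }
)
top_users_rating["weightedRating"] = (
    top_users_rating["similarityIndex"] * top_users_rating["rating"]
)
sums = top_users_rating.groupby("moduleId")[["similarityIndex", "weightedRating"]].sum()
scores = sums["weightedRating"] / sums["similarityIndex"]
print(scores.sort_values(ascending=False))
# m1: (0.9*5 + 0.4*3) / (0.9 + 0.4) = 5.7 / 1.3 ≈ 4.38
# m2: (0.8*2 + 0.2*5) / (0.8 + 0.2) = 2.6 / 1.0 = 2.60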