I am able to compute cosine similarity using a pandas DataFrame, but I need to do the same in PySpark using a Spark DataFrame. I couldn't find any relevant information on Google, and I am new to PySpark, so I am reaching out for help.
# Sample documents that the TF-IDF model is fitted on.
corpus = [
    "Hello there good man!",
    "It is quite windy in ",
    "How is the weather today?",
    "How isssss ay?",
    "windy London is where",
]

# Free-text query to compare against every document in the corpus.
query = "windy London"
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF model on the corpus and vectorize every document in one step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
print(tfidf_matrix.shape)

# Optional: densify the sparse matrix to inspect the weights as a table.
doc_term_matrix = tfidf_matrix.todense()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

df = pd.DataFrame(doc_term_matrix, columns=feature_names)
#Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity
# Vectorize the query with the vectorizer that was already fitted on the
# corpus. Reusing it (instead of fitting a second TfidfVectorizer) avoids
# redundant work and guarantees the query shares the exact vocabulary and
# IDF weights used to build tfidf_matrix.
queryTFIDF = tfidf_vectorizer.transform([query])

# One similarity score per corpus document, flattened to a 1-D array.
cosine_similarities = cosine_similarity(queryTFIDF, tfidf_matrix).flatten()

# argsort() orders indices by ascending score, so the last entry is the
# index of the most similar document (the first prediction).
related_product_indices = cosine_similarities.argsort()[-1]