Let us build a search engine that pulls up similar products for the product a user is currently viewing. We shall build this using the important features of the product. The different algorithms used to achieve this are:
The first four are NLP-based techniques, while the last is a Deep Learning technique.
Happy Learning!
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
import plotly
import pickle
data = pd.read_json('D:\\Applied_AI_Workshop_Code_Data\\tops_fashion.json')
print("Num of Data Points/Rows :", data.shape[0], "Num of Variables/Features/Columns :", data.shape[1])
print(data.columns)
data = data[['asin','brand','color','medium_image_url','product_type_name','title','formatted_price']]
print("Num of Data Points/Rows :", data.shape[0], "Num of Variables/Features/Columns :", data.shape[1])
print(data.columns)
data.head()
BASIC STATS FOR FEATURE - product_type_name
print(data['product_type_name'].describe())
print(data['product_type_name'].unique())
product_type_count=Counter(list(data['product_type_name']))
product_type_count.most_common(10)
BASIC STATS FOR FEATURE - brand
print(data['brand'].describe())
# 151 missing values
brand_count=Counter(list(data['brand']))
brand_count.most_common(10)
BASIC STATS FOR FEATURE - color
print(data['color'].describe())
# Many missing values (around 65% missing)
color_count=Counter(list(data['color']))
color_count.most_common(10)
BASIC STATS FOR FEATURE - formatted_price
print(data['formatted_price'].describe())
# only around 18% of products have price details
price_count=Counter(list(data['formatted_price']))
price_count.most_common(10)
BASIC STATS FOR FEATURE - title
print(data['title'].describe())
title_count=Counter(list(data['title']))
title_count.most_common(10)
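As a sanity check on the percentages quoted in the comments above, the missing counts can be computed directly. A minimal sketch over the same DataFrame:
# Percentage of missing values per retained column
print((data.isnull().sum() * 100.0 / data.shape[0]).round(1))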
Storing the Entire Raw Dataset to a pickle file
data.to_pickle('D:\\Applied_AI_Workshop_Code_Data\\pickles\\180k_apparel_data')
Reduce the number of data points for faster processing
Step 1: Filter out the products with no price data
data=data[~(data['formatted_price'].isnull())]
print('Num of products after eliminating Null Price :', data.shape[0])
Step 2: Filter out the products with no color data
data=data[~(data['color'].isnull())]
print('Num of products after eliminating Null color :', data.shape[0])
Storing the Subset of the Raw Dataset to a pickle file
data.to_pickle('D:\\Applied_AI_Workshop_Code_Data\\pickles\\28k_apparel_data')
Let us analyze the number of duplicate titles. These may be due to the same product appearing in different versions, e.g., in different colors or sizes.
print(sum(data.duplicated('title')))
Sample Products with similar titles:
title_count=Counter(list(data['title']))
title_count.most_common(10)
data[data['title']=="LJT Women's 2016 NASCAR Sprint All Star Race T-Shirt"].head()
For products like the above, the search would keep returning the same product in different sizes/colors as the similar products.
But that is not what we intend to show.
So let us remove the duplicate rows and retain only unique titles.
data.head()
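The exact-duplicate removal itself is not shown in the cells that follow (the next filter only drops short titles), so here is a minimal sketch of what that step could look like with pandas drop_duplicates:
# Sketch of the dedup step (an assumption, not the notebook's exact code):
# keep the first row for each distinct title
data = data.drop_duplicates(subset='title', keep='first')
print('Num of Data Points after dropping exact duplicate titles :', data.shape[0])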
data_sorted = data[data['title'].apply(lambda x: len(x.split()) > 4)].copy()  # keep titles longer than 4 words; copy so the in-place sort below is safe
print('Num of Data Points after removing the products with short titles : ', data_sorted.shape[0])
Sort the Data Set on the title column to help the next steps execute faster
data_sorted.sort_values('title',inplace=True,ascending=False)
data_sorted.head()
data_sorted1=data_sorted[data_sorted['title'].str.contains("tokidoki")]
data_sorted1
In the above case, the first three product titles are exactly the same except for the last few words.
These rows will not get removed even if we drop exact duplicates, because the last few words differ.
We have to figure out a way to remove such near-duplicates.
i = 0
'''
while (i < (data_sorted.shape[0])):
    j = i + 1
    while (j < (data_sorted.shape[0])):
        setA = set(data_sorted.iloc[i]['title'].split())  # Title 1 - word set
        setB = set(data_sorted.iloc[j]['title'].split())  # Title 2 - word set
        if ((len(setA.difference(setB)) <= 2) & (len(setB.difference(setA)) <= 2)):  # Compare the differing words between the titles
            data_sorted.drop(data_sorted.index[j], inplace=True)  # Delete the row if only 2 or fewer words differ
        else:
            j = j + 1
    i = i + 1
print('After removing the near duplicate title products, num of rows: ', data_sorted.shape[0])
'''
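The commented-out scan above is quadratic in the number of rows and far too slow to run inline, which is why the next cell loads its result from a pre-computed pickle. Because the table is already sorted by title, a much cheaper approximation (my sketch, not the author's method) compares each title only with the last title that was kept:
# Approximate near-duplicate removal: after sorting, near-duplicates sit
# next to each other, so compare each title only with the previously kept one
kept_indices = []
prev_words = set()
for idx, title in zip(data_sorted.index, data_sorted['title']):
    words = set(title.split())
    if kept_indices and len(words - prev_words) <= 2 and len(prev_words - words) <= 2:
        continue  # differs from the previous kept title by <= 2 words each way
    kept_indices.append(idx)
    prev_words = words
data_sorted = data_sorted.loc[kept_indices]
print('After the approximate near-duplicate pass, num of rows: ', data_sorted.shape[0])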
data = pd.read_pickle("D:\\Applied_AI_Workshop_Code_Data\\pickles\\16k_apperal_data")
Work on preprocessing and cleaning the Data
stop_words = set(stopwords.words('english'))
#print('list of stop words:', stop_words)
import string
def text_process(title):
    nopunc = [char for char in title if char not in string.punctuation]  # 1. Remove punctuation
    nopunc = ''.join(nopunc)
    # 2. Lowercase and remove stopwords (using the precomputed set, which is much faster)
    word_list = [word.lower() for word in nopunc.split() if word.lower() not in stop_words]
    return ' '.join(word_list)  # 3. Return the cleaned title
data['title']=data['title'].apply(text_process)
data['title'].head()
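For a sense of what the cleaning does, apply it to a made-up title (the string below is hypothetical, not from the dataset):
# Hypothetical title, only to illustrate the transformation
print(text_process("Women's Casual Summer T-Shirt, with V-Neck!"))
# -> womens casual summer tshirt vneck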
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
#print(stemmer.stem('arguing'))
#print(stemmer.stem('argued'))
We apply no stemming in this use case because the titles consist mostly of nouns and adjectives.
Stemming adds value mostly when the words are verbs.
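A quick illustration (the outputs in the comments are what PorterStemmer produces):
# Verb forms collapse to a shared stem...
print(stemmer.stem('arguing'), stemmer.stem('argued'))  # argu argu
# ...while typical title nouns are left essentially unchanged
print(stemmer.stem('shirt'), stemmer.stem('dress'))     # shirt dress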
def display_img(url):
    response = requests.get(url)                 # fetch the image over HTTP
    img = Image.open(BytesIO(response.content))  # decode the bytes into a PIL image
    plt.imshow(img)
    plt.show()
from sklearn.feature_extraction.text import CountVectorizer
title_vectorizer=CountVectorizer()
title_features=title_vectorizer.fit_transform(data['title'])
title_features.get_shape()
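For intuition about these features, here is a toy run of the same vectorizer on two hypothetical titles (on scikit-learn older than 1.0, use get_feature_names() instead of get_feature_names_out()):
# Toy corpus: the vocabulary is the union of all words, each row a count vector
toy = CountVectorizer()
toy_feats = toy.fit_transform(['blue cotton shirt', 'red cotton dress'])
print(toy.get_feature_names_out())  # ['blue' 'cotton' 'dress' 'red' 'shirt']
print(toy_feats.toarray())          # [[1 1 0 0 1], [0 1 1 1 0]]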
def bag_of_words_model(doc_id, num_results):
    # Distance of every title vector from the query title's vector
    pairwise_dist = pairwise_distances(title_features, title_features[doc_id])
    indices = np.argsort(pairwise_dist.flatten())[0:num_results]  # positions of the closest titles
    pdists = np.sort(pairwise_dist.flatten())[0:num_results]      # the corresponding distances
    df_indices = list(data.index[indices])
    for i in range(0, len(indices)):
        print('Title : ', data['title'].loc[df_indices[i]])
        display_img(url=data['medium_image_url'].loc[df_indices[i]])
bag_of_words_model(12566,5)
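Note that pairwise_distances defaults to Euclidean distance. For sparse text vectors, cosine distance is a common alternative because it discounts differences in title length; the variant below is my substitution for comparison, not the notebook's setting:
# Same nearest-title lookup, but with cosine distance instead of the default
cos_dist = pairwise_distances(title_features, title_features[12566], metric='cosine')
print(np.argsort(cos_dist.flatten())[0:5])  # positions of the 5 closest titles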
tfidf_title_vectorizer = TfidfVectorizer(min_df = 0)
tfidf_title_features = tfidf_title_vectorizer.fit_transform(data['title'])
def tf_idf_model(doc_id, num_results):
    # Distance of every TF-IDF title vector from the query title's vector
    pairwise_dist = pairwise_distances(tfidf_title_features, tfidf_title_features[doc_id])
    indices = np.argsort(pairwise_dist.flatten())[0:num_results]  # positions of the closest titles
    pdists = np.sort(pairwise_dist.flatten())[0:num_results]      # the corresponding distances
    df_indices = list(data.index[indices])
    for i in range(0, len(indices)):
        print('Title : ', data['title'].loc[df_indices[i]])
        display_img(url=data['medium_image_url'].loc[df_indices[i]])
tf_idf_model(12566,5)
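To see why TF-IDF can re-rank results relative to raw counts, it helps to inspect which words of the query title carry the most weight. A minimal sketch (again, get_feature_names_out() needs scikit-learn >= 1.0):
# Rank the query title's words by their TF-IDF weight in that document
doc_id = 12566
weights = tfidf_title_features[doc_id].toarray().flatten()
vocab = tfidf_title_vectorizer.get_feature_names_out()
for pos in np.argsort(weights)[::-1][:10]:
    if weights[pos] > 0:  # skip vocabulary words absent from this title
        print(vocab[pos], round(float(weights[pos]), 3))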