Let us build a search engine that pulls up similar products for the product a user is currently viewing. We shall build this using the important features of the product. The different algorithms used to achieve this are:
The first four are NLP-based techniques, while the last is a Deep Learning technique.
Happy Learning!
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
import plotly
import pickle
data = pd.read_json('D:\\Applied_AI_Workshop_Code_Data\\tops_fashion.json')
print("Num of Data Points/Rows :", data.shape[0], "Num of Variables/Features/Columns :", data.shape[1])
print(data.columns)
data = data[['asin','brand','color','medium_image_url','product_type_name','title','formatted_price']]
print("Num of Data Points/Rows :", data.shape[0], "Num of Variables/Features/Columns :", data.shape[1])
print(data.columns)
data.head()
BASIC STATS FOR FEATURE - product_type_name
print(data['product_type_name'].describe())
print(data['product_type_name'].unique())
product_type_count=Counter(list(data['product_type_name']))
product_type_count.most_common(10)
BASIC STATS FOR FEATURE - brand
print(data['brand'].describe())
# 151 missing values
brand_count=Counter(list(data['brand']))
brand_count.most_common(10)
BASIC STATS FOR FEATURE - color
print(data['color'].describe())
# Many missing values (around 65% missing)
color_count=Counter(list(data['color']))
color_count.most_common(10)
BASIC STATS FOR FEATURE - formatted_price
print(data['formatted_price'].describe())
# only around 18% of products have price details
price_count=Counter(list(data['formatted_price']))
price_count.most_common(10)
BASIC STATS FOR FEATURE - title
print(data['title'].describe())
title_count=Counter(list(data['title']))
title_count.most_common(10)
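As a sanity check on the percentages quoted in the comments above, the missing counts can be computed directly. A minimal sketch over the same DataFrame:
# Percentage of missing values per retained column
print((data.isnull().sum() * 100.0 / data.shape[0]).round(1))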
Storing the Entire Raw Dataset to a pickle file
data.to_pickle('D:\\Applied_AI_Workshop_Code_Data\\pickles\\180k_apparel_data')
Reduce the number of data points for faster processing
Step 1: Filter out the products with no price data
data=data[~(data['formatted_price'].isnull())]
print('Num of products after eliminating Null Price :', data.shape[0])
Step 2: Filter out the products with no color data
data=data[~(data['color'].isnull())]
print('Num of products after eliminating Null color :', data.shape[0])
Storing the Subset of the Raw Dataset to a pickle file
data.to_pickle('D:\\Applied_AI_Workshop_Code_Data\\pickles\\28k_apparel_data')
Let us analyze the number of duplicate titles. These may be due to the same product appearing in different versions, e.g., in different colors or sizes.
print(sum(data.duplicated('title')))
Sample Products with similar titles:
title_count=Counter(list(data['title']))
title_count.most_common(10)
data[data['title']=="LJT Women's 2016 NASCAR Sprint All Star Race T-Shirt"].head()
For products like the above, the search would keep returning the same product in different sizes/colors as the similar products.
But that is not what we intend to show.
So let us remove the duplicate rows and retain only unique titles.
data.head()
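The exact-duplicate removal itself is not shown in the cells that follow (the next filter only drops short titles), so here is a minimal sketch of what that step could look like with pandas drop_duplicates:
# Sketch of the dedup step (an assumption, not the notebook's exact code):
# keep the first row for each distinct title
data = data.drop_duplicates(subset='title', keep='first')
print('Num of Data Points after dropping exact duplicate titles :', data.shape[0])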
data_sorted = data[data['title'].apply(lambda x: len(x.split()) > 4)].copy()  # keep titles longer than 4 words; copy so the in-place sort below is safe
print('Num of Data Points after removing the products with short titles : ', data_sorted.shape[0])
Sort the Data Set on the title column to help the next steps execute faster
data_sorted.sort_values('title',inplace=True,ascending=False)
data_sorted.head()
data_sorted1=data_sorted[data_sorted['title'].str.contains("tokidoki")]
data_sorted1
In the above case, the first three product titles are exactly the same except for the last few words.
These rows will not get removed even if we drop exact duplicates, because the last few words differ.
We have to figure out a way to remove such near-duplicates.
i = 0
'''
while (i < (data_sorted.shape[0])):
    j = i + 1
    while (j < (data_sorted.shape[0])):
        setA = set(data_sorted.iloc[i]['title'].split())  # Title 1 - word set
        setB = set(data_sorted.iloc[j]['title'].split())  # Title 2 - word set
        if ((len(setA.difference(setB)) <= 2) & (len(setB.difference(setA)) <= 2)):  # Compare the differing words between the titles
            data_sorted.drop(data_sorted.index[j], inplace=True)  # Delete the row if only 2 or fewer words differ
        else:
            j = j + 1
    i = i + 1
print('After removing the near duplicate title products, num of rows: ', data_sorted.shape[0])
'''
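The commented-out scan above is quadratic in the number of rows and far too slow to run inline, which is why the next cell loads its result from a pre-computed pickle. Because the table is already sorted by title, a much cheaper approximation (my sketch, not the author's method) compares each title only with the last title that was kept:
# Approximate near-duplicate removal: after sorting, near-duplicates sit
# next to each other, so compare each title only with the previously kept one
kept_indices = []
prev_words = set()
for idx, title in zip(data_sorted.index, data_sorted['title']):
    words = set(title.split())
    if kept_indices and len(words - prev_words) <= 2 and len(prev_words - words) <= 2:
        continue  # differs from the previous kept title by <= 2 words each way
    kept_indices.append(idx)
    prev_words = words
data_sorted = data_sorted.loc[kept_indices]
print('After the approximate near-duplicate pass, num of rows: ', data_sorted.shape[0])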
data = pd.read_pickle("D:\\Applied_AI_Workshop_Code_Data\\pickles\\16k_apperal_data")
Work on preprocessing and cleaning the Data
stop_words = set(stopwords.words('english'))
#print('list of stop words:', stop_words)
import string
def text_process(title):
    nopunc = [char for char in title if char not in string.punctuation]  # 1. Remove punctuation
    nopunc = ''.join(nopunc)
    # 2. Lowercase and remove stopwords (using the precomputed set, which is much faster)
    word_list = [word.lower() for word in nopunc.split() if word.lower() not in stop_words]
    return ' '.join(word_list)  # 3. Return the cleaned title
data['title']=data['title'].apply(text_process)
data['title'].head()
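For a sense of what the cleaning does, apply it to a made-up title (the string below is hypothetical, not from the dataset):
# Hypothetical title, only to illustrate the transformation
print(text_process("Women's Casual Summer T-Shirt, with V-Neck!"))
# -> womens casual summer tshirt vneck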
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
#print(stemmer.stem('arguing'))
#print(stemmer.stem('argued'))
We apply no stemming in this use case because the titles consist mostly of nouns and adjectives.
Stemming adds value mostly when the words are verbs.
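A quick illustration (the outputs in the comments are what PorterStemmer produces):
# Verb forms collapse to a shared stem...
print(stemmer.stem('arguing'), stemmer.stem('argued'))  # argu argu
# ...while typical title nouns are left essentially unchanged
print(stemmer.stem('shirt'), stemmer.stem('dress'))     # shirt dress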
def display_img(url):
    response = requests.get(url)                 # fetch the image over HTTP
    img = Image.open(BytesIO(response.content))  # decode the bytes into a PIL image
    plt.imshow(img)
    plt.show()
from sklearn.feature_extraction.text import CountVectorizer
title_vectorizer=CountVectorizer()
title_features=title_vectorizer.fit_transform(data['title'])
title_features.get_shape()
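For intuition about these features, here is a toy run of the same vectorizer on two hypothetical titles (on scikit-learn older than 1.0, use get_feature_names() instead of get_feature_names_out()):
# Toy corpus: the vocabulary is the union of all words, each row a count vector
toy = CountVectorizer()
toy_feats = toy.fit_transform(['blue cotton shirt', 'red cotton dress'])
print(toy.get_feature_names_out())  # ['blue' 'cotton' 'dress' 'red' 'shirt']
print(toy_feats.toarray())          # [[1 1 0 0 1], [0 1 1 1 0]]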
def bag_of_words_model(doc_id, num_results):
    # Distance of every title vector from the query title's vector
    pairwise_dist = pairwise_distances(title_features, title_features[doc_id])
    indices = np.argsort(pairwise_dist.flatten())[0:num_results]  # positions of the closest titles
    pdists = np.sort(pairwise_dist.flatten())[0:num_results]      # the corresponding distances
    df_indices = list(data.index[indices])
    for i in range(0, len(indices)):
        print('Title : ', data['title'].loc[df_indices[i]])
        display_img(url=data['medium_image_url'].loc[df_indices[i]])
bag_of_words_model(12566,5)
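Note that pairwise_distances defaults to Euclidean distance. For sparse text vectors, cosine distance is a common alternative because it discounts differences in title length; the variant below is my substitution for comparison, not the notebook's setting:
# Same nearest-title lookup, but with cosine distance instead of the default
cos_dist = pairwise_distances(title_features, title_features[12566], metric='cosine')
print(np.argsort(cos_dist.flatten())[0:5])  # positions of the 5 closest titles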
tfidf_title_vectorizer = TfidfVectorizer(min_df = 0)
tfidf_title_features = tfidf_title_vectorizer.fit_transform(data['title'])
def tf_idf_model(doc_id, num_results):
    # Distance of every TF-IDF title vector from the query title's vector
    pairwise_dist = pairwise_distances(tfidf_title_features, tfidf_title_features[doc_id])
    indices = np.argsort(pairwise_dist.flatten())[0:num_results]  # positions of the closest titles
    pdists = np.sort(pairwise_dist.flatten())[0:num_results]      # the corresponding distances
    df_indices = list(data.index[indices])
    for i in range(0, len(indices)):
        print('Title : ', data['title'].loc[df_indices[i]])
        display_img(url=data['medium_image_url'].loc[df_indices[i]])
tf_idf_model(12566,5)
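To see why TF-IDF can re-rank results relative to raw counts, it helps to inspect which words of the query title carry the most weight. A minimal sketch (again, get_feature_names_out() needs scikit-learn >= 1.0):
# Rank the query title's words by their TF-IDF weight in that document
doc_id = 12566
weights = tfidf_title_features[doc_id].toarray().flatten()
vocab = tfidf_title_vectorizer.get_feature_names_out()
for pos in np.argsort(weights)[::-1][:10]:
    if weights[pos] > 0:  # skip vocabulary words absent from this title
        print(vocab[pos], round(float(weights[pos]), 3))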