As this article explains, data preparation is a huge part of modern ML. The article walks through data preparation on a publicly available beer reviews dataset, a task that involves a lot of string-based processing, and demonstrates how to use GPUs and libraries such as nvStrings and cudf to make that processing scalable.
With Bodo, however, this data processing task is much simpler, and as an added benefit you can reuse your existing pandas code.
For example, here is a simple pandas implementation of this data prep task:
import os
import numpy as np
import pandas as pd
import itertools
import time

dir_path = os.path.dirname(os.path.realpath(__file__))

# Create lists of stopwords and punctuation that will be removed
with open(f"{dir_path}/nltk-stopwords.txt", "r") as fh:
    STOPWORDS = list(map(str.strip, fh.readlines()))
PUNCT_LIST = [r"\.", r"\-", r"\?", r"\:", ":", "!", "&", "'", ","]

# Define the regexes that will be used to remove punctuation and stopwords from the reviews.
punc_regex = "|".join([f"({p})" for p in PUNCT_LIST])
stopword_regex = "|".join([f"\\b({s})\\b" for s in STOPWORDS])


def preprocess(reviews):
    # lowercase and strip
    reviews = reviews.str.lower()
    reviews = reviews.str.strip()
    # remove punctuation and stopwords
    reviews = reviews.str.replace(punc_regex, "", regex=True)
    reviews = reviews.str.replace(stopword_regex, "", regex=True)
    return reviews


def find_top_words(review_filename):
    # Load in the data
    t_start = time.time()
    df = pd.read_csv(review_filename, parse_dates=[2])
    print("read time", time.time() - t_start)
    score = df.score
    reviews = df.text

    t1 = time.time()
    reviews = preprocess(reviews)
    print("preprocess time", time.time() - t1)

    t1 = time.time()
    # create low and high score series
    low_threshold = 1.5
    high_threshold = 4.95
    high_reviews = reviews[score > high_threshold]
    low_reviews = reviews[score <= low_threshold]
    high_reviews = high_reviews.dropna()
    low_reviews = low_reviews.dropna()
    high_colsplit = high_reviews.str.split()
    low_colsplit = low_reviews.str.split()
    print("high/low time", time.time() - t1)

    t1 = time.time()
    high_words = high_colsplit.explode()
    low_words = low_colsplit.explode()
    top_words = high_words.value_counts().head(25)
    low_words = low_words.value_counts().head(25)
    print("value_counts time", time.time() - t1)
    print("total time", time.time() - t_start)

    print("Top words: ")
    print(top_words)
    print("Low words: ")
    print(low_words)

find_top_words("reviews_sample.csv")
Running this on a sample dataset, we get the following output:
% python -u beer-reviews.py
read time 6.636693000793457
preprocess time 128.2600917816162
high/low time 0.015850067138671875
value_counts time 0.013387203216552734
total time 134.92668890953064
Top words:
beer 333
one 158
taste 140
head 119
like 117
best 102
dark 90
chocolate 90
great 86
perfect 80
good 79
sweet 77
smell 73
bottle 72
ive 70
flavor 68
well 65
glass 65
ever 65
nice 64
aroma 64
malt 63
beers 62
hops 62
bourbon 62
Name: text, dtype: int64
Low words:
beer 239
like 109
taste 104
head 69
one 65
light 65
smell 57
bad 53
bottle 52
really 49
good 41
would 40
get 38
water 35
flavor 33
beers 32
carbonation 32
much 32
smells 32
corn 31
even 31
glass 31
poured 30
tastes 29
mouthfeel 29
Name: text, dtype: int64
The preprocessing step, which applies these regular expressions to every review, dominates the runtime (over 128 of the roughly 135 seconds). By adding Bodo's JIT decorator to these functions, we can automatically parallelize this code:
import os
import numpy as np
import pandas as pd
import itertools
import time
import bodo

dir_path = os.path.dirname(os.path.realpath(__file__))

# Create lists of stopwords and punctuation that will be removed
with open(f"{dir_path}/nltk-stopwords.txt", "r") as fh:
    STOPWORDS = list(map(str.strip, fh.readlines()))
PUNCT_LIST = [r"\.", r"\-", r"\?", r"\:", ":", "!", "&", "'", ","]

# Define the regexes that will be used to remove punctuation and stopwords from the reviews.
punc_regex = "|".join([f"({p})" for p in PUNCT_LIST])
stopword_regex = "|".join([f"\\b({s})\\b" for s in STOPWORDS])


@bodo.jit(distributed=["reviews"])  # <--- That's all
def preprocess(reviews):
    # lowercase and strip
    reviews = reviews.str.lower()
    reviews = reviews.str.strip()
    # remove punctuation and stopwords
    reviews = reviews.str.replace(punc_regex, "", regex=True)
    reviews = reviews.str.replace(stopword_regex, "", regex=True)
    return reviews


@bodo.jit  # <--- That's all
def find_top_words(review_filename):
    # Load in the data
    t_start = time.time()
    df = pd.read_csv(review_filename, parse_dates=[2])
    print("read time", time.time() - t_start)
    score = df.score
    reviews = df.text

    t1 = time.time()
    reviews = preprocess(reviews)
    print("preprocess time", time.time() - t1)

    t1 = time.time()
    # create low and high score series
    low_threshold = 1.5
    high_threshold = 4.95
    high_reviews = reviews[score > high_threshold]
    low_reviews = reviews[score <= low_threshold]
    high_reviews = high_reviews.dropna()
    low_reviews = low_reviews.dropna()
    high_colsplit = high_reviews.str.split()
    low_colsplit = low_reviews.str.split()
    print("high/low time", time.time() - t1)

    t1 = time.time()
    high_words = high_colsplit.explode()
    low_words = low_colsplit.explode()
    top_words = high_words.value_counts().head(25)
    low_words = low_words.value_counts().head(25)
    print("value_counts time", time.time() - t1)
    print("total time", time.time() - t_start)

    print("Top words: ")
    print(top_words)
    print("Low words: ")
    print(low_words)

find_top_words("reviews_sample.csv")
For instance, running it on 4 cores of my MacBook Pro, we get the same output:
% mpiexec -n 4 python -u beer-reviews.py
read time 4.1938237880240194
preprocess time 36.57433161896188
high/low time 0.17758165003033355
value_counts time 0.016226659005042166
total time 40.9621572100441
Top words:
beer 333
one 158
taste 140
head 119
like 117
best 102
dark 90
chocolate 90
great 86
perfect 80
good 79
sweet 77
smell 73
bottle 72
ive 70
flavor 68
well 65
glass 65
ever 65
nice 64
aroma 64
malt 63
beers 62
hops 62
bourbon 62
Name: text, dtype: int64
Low words:
beer 239
like 109
taste 104
head 69
one 65
light 65
smell 57
bad 53
bottle 52
really 49
good 41
would 40
get 38
water 35
flavor 33
beers 32
carbonation 32
much 32
smells 32
corn 31
even 31
glass 31
poured 30
tastes 29
mouthfeel 29
Name: text, dtype: int64
This is more than 3x faster, without rewriting a single line of the pandas code. The same script can scale to many more cores, or even a cluster, making this simple pandas code usable in production environments.
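Since Bodo launches the script with MPI, scaling out is just a matter of asking mpiexec for more processes; no Python changes are needed. For example, on a machine or cluster with more cores available (the core count below is only illustrative):
% mpiexec -n 16 python -u beer-reviews.py
Each process runs the same script, and Bodo partitions the reviews data and the string processing across all of the processes that mpiexec starts.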