As this article explains, data preparation is a huge part of modern ML. The article walks through data preparation on a publicly available beer reviews dataset, a task that involves a lot of string-based processing, and demonstrates how to use GPUs and libraries such as nvStrings and cudf to make that processing scalable.
With Bodo, however, this data processing task is much simpler, and as an added benefit you can reuse your existing pandas code.
For example, here is a simple pandas implementation of this data prep task:
import os
import numpy as np
import pandas as pd
import itertools
import time

dir_path = os.path.dirname(os.path.realpath(__file__))

# Create lists of stopwords and punctuation that will be removed
with open(f"{dir_path}/nltk-stopwords.txt", "r") as fh:
    STOPWORDS = list(map(str.strip, fh.readlines()))
PUNCT_LIST = [r"\.", r"\-", r"\?", r"\:", ":", "!", "&", "'", ","]

# Define the regexes that will be used to remove punctuation and stopwords from the reviews.
punc_regex = "|".join([f"({p})" for p in PUNCT_LIST])
stopword_regex = "|".join([f"\\b({s})\\b" for s in STOPWORDS])


def preprocess(reviews):
    # lowercase and strip
    reviews = reviews.str.lower()
    reviews = reviews.str.strip()
    # remove punctuation and stopwords
    reviews = reviews.str.replace(punc_regex, "", regex=True)
    reviews = reviews.str.replace(stopword_regex, "", regex=True)
    return reviews


def find_top_words(review_filename):
    # Load in the data
    t_start = time.time()
    df = pd.read_csv(review_filename, parse_dates=[2])
    print("read time", time.time() - t_start)
    score = df.score
    reviews = df.text

    t1 = time.time()
    reviews = preprocess(reviews)
    print("preprocess time", time.time() - t1)

    t1 = time.time()
    # create low and high score series
    low_threshold = 1.5
    high_threshold = 4.95
    high_reviews = reviews[score > high_threshold]
    low_reviews = reviews[score <= low_threshold]
    high_reviews = high_reviews.dropna()
    low_reviews = low_reviews.dropna()
    high_colsplit = high_reviews.str.split()
    low_colsplit = low_reviews.str.split()
    print("high/low time", time.time() - t1)

    t1 = time.time()
    high_words = high_colsplit.explode()
    low_words = low_colsplit.explode()
    top_words = high_words.value_counts().head(25)
    low_words = low_words.value_counts().head(25)
    print("value_counts time", time.time() - t1)
    print("total time", time.time() - t_start)

    print("Top words: ")
    print(top_words)
    print("Low words: ")
    print(low_words)

find_top_words("reviews_sample.csv")
Running this on a sample dataset, we get the following output:
% python -u beer-reviews.py
read time 6.636693000793457
preprocess time 128.2600917816162
high/low time 0.015850067138671875
value_counts time 0.013387203216552734
total time 134.92668890953064
Top words:
beer 333
one 158
taste 140
head 119
like 117
best 102
dark 90
chocolate 90
great 86
perfect 80
good 79
sweet 77
smell 73
bottle 72
ive 70
flavor 68
well 65
glass 65
ever 65
nice 64
aroma 64
malt 63
beers 62
hops 62
bourbon 62
Name: text, dtype: int64
Low words:
beer 239
like 109
taste 104
head 69
one 65
light 65
smell 57
bad 53
bottle 52
really 49
good 41
would 40
get 38
water 35
flavor 33
beers 32
carbonation 32
much 32
smells 32
corn 31
even 31
glass 31
poured 30
tastes 29
mouthfeel 29
Name: text, dtype: int64
The preprocessing step, which applies these regular expressions to every review, dominates the runtime (over 128 of the roughly 135 seconds). By adding Bodo's JIT decorator to these functions, we can automatically parallelize this code:
import os
import numpy as np
import pandas as pd
import itertools
import time
import bodo

dir_path = os.path.dirname(os.path.realpath(__file__))

# Create lists of stopwords and punctuation that will be removed
with open(f"{dir_path}/nltk-stopwords.txt", "r") as fh:
    STOPWORDS = list(map(str.strip, fh.readlines()))
PUNCT_LIST = [r"\.", r"\-", r"\?", r"\:", ":", "!", "&", "'", ","]

# Define the regexes that will be used to remove punctuation and stopwords from the reviews.
punc_regex = "|".join([f"({p})" for p in PUNCT_LIST])
stopword_regex = "|".join([f"\\b({s})\\b" for s in STOPWORDS])


@bodo.jit(distributed=["reviews"])  # <--- That's all
def preprocess(reviews):
    # lowercase and strip
    reviews = reviews.str.lower()
    reviews = reviews.str.strip()
    # remove punctuation and stopwords
    reviews = reviews.str.replace(punc_regex, "", regex=True)
    reviews = reviews.str.replace(stopword_regex, "", regex=True)
    return reviews


@bodo.jit  # <--- That's all
def find_top_words(review_filename):
    # Load in the data
    t_start = time.time()
    df = pd.read_csv(review_filename, parse_dates=[2])
    print("read time", time.time() - t_start)
    score = df.score
    reviews = df.text

    t1 = time.time()
    reviews = preprocess(reviews)
    print("preprocess time", time.time() - t1)

    t1 = time.time()
    # create low and high score series
    low_threshold = 1.5
    high_threshold = 4.95
    high_reviews = reviews[score > high_threshold]
    low_reviews = reviews[score <= low_threshold]
    high_reviews = high_reviews.dropna()
    low_reviews = low_reviews.dropna()
    high_colsplit = high_reviews.str.split()
    low_colsplit = low_reviews.str.split()
    print("high/low time", time.time() - t1)

    t1 = time.time()
    high_words = high_colsplit.explode()
    low_words = low_colsplit.explode()
    top_words = high_words.value_counts().head(25)
    low_words = low_words.value_counts().head(25)
    print("value_counts time", time.time() - t1)
    print("total time", time.time() - t_start)

    print("Top words: ")
    print(top_words)
    print("Low words: ")
    print(low_words)

find_top_words("reviews_sample.csv")
For instance, running it on 4 cores of my MacBook Pro, we get the same output:
% mpiexec -n 4 python -u beer-reviews.py
read time 4.1938237880240194
preprocess time 36.57433161896188
high/low time 0.17758165003033355
value_counts time 0.016226659005042166
total time 40.9621572100441
Top words:
beer 333
one 158
taste 140
head 119
like 117
best 102
dark 90
chocolate 90
great 86
perfect 80
good 79
sweet 77
smell 73
bottle 72
ive 70
flavor 68
well 65
glass 65
ever 65
nice 64
aroma 64
malt 63
beers 62
hops 62
bourbon 62
Name: text, dtype: int64
Low words:
beer 239
like 109
taste 104
head 69
one 65
light 65
smell 57
bad 53
bottle 52
really 49
good 41
would 40
get 38
water 35
flavor 33
beers 32
carbonation 32
much 32
smells 32
corn 31
even 31
glass 31
poured 30
tastes 29
mouthfeel 29
Name: text, dtype: int64
This is more than 3x faster, without rewriting a single line of the pandas code. The same script can scale to many more cores, or even a cluster, making this simple pandas code usable in production environments.
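Since Bodo launches the script with MPI, scaling out is just a matter of asking mpiexec for more processes; no Python changes are needed. For example, on a machine or cluster with more cores available (the core count below is only illustrative):
% mpiexec -n 16 python -u beer-reviews.py
Each process runs the same script, and Bodo partitions the reviews data and the string processing across all of the processes that mpiexec starts.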