I am using kaggle natural disaster twitter dataset to build a text classification. Apart from the text data, I would like to incorporate a separate numerical column called disaster_count which is built below in my classification model.
traindf = pd.read_csv('../train.csv',
dtype={'id': np.int16, 'target': np.int8})
disaster = ['wildfire','flooding','hurricane','earthquake','tornado','volcano','fire']
traindf['disaster_count'] = traindf['text'].apply(lambda x: len([i for i in str(x).split(' ') if i in disaster]))
So far I have discovered combining text and numerical data in a pipe object and feed it into a classifier. The following is my code for building the pipe.
# Data cleaning
def remove_punct(text):
table=str.maketrans('','',string.punctuation)
return(text.translate(table))
def remove_url(text):
url = re.compile(r'https?://\S+|www\.\S+')
return url.sub(r'',text)
# https://stackoverflow.com/a/49146722/330558
def remove_emoji(text):
emoji_pattern = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
"]+", flags=re.UNICODE)
return emoji_pattern.sub(r'', text)
stopwords = set(stopwords.words('english'))
def remove_stopwords(text):
filtered =[i for i in word_tokenize(text) if not i.lower() in stopwords]
return filtered
traindf['text'] = traindf['text'].apply(lambda x : remove_url(x))
traindf['text'] = traindf['text'].apply(lambda x : remove_emoji(x))
traindf['text'] = traindf['text'].apply(lambda x : remove_punct(x))
# Splitting the dataset into train and test
label = traindf.target
X_train, X_test, y_train, y_test = train_test_split(features,label,test_size = 0.33,random_state= 42)
# Get the disaster_count feature
def get_numeric_data(x):
data = [r for r in x.disaster_count]
return(data)
# Get the text data
def get_text_data(x):
corpus = [r for r in x.text]
return(corpus)
transfomer_numeric = FunctionTransformer(get_numeric_data)
transformer_text = FunctionTransformer(get_text_data)
# Building pipe
pipe = Pipeline([('features',FeatureUnion([
('numeric',Pipeline([('selector',transfomer_numeric)])),
('text',Pipeline([('selector',transformer_text),('vec',TfidfVectorizer(stop_words= 'english',analyzer='word'))])
)])),
('clf', RandomForestClassifier())])
pipe.fit(X_train,y_train)
After running the code above this is the error I got.
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 5100, expected 1.
Any idea what is causing this please? I checked both output from the transformers should be the same. Thanks!
len(transformer_text.fit_transform(X_train))
Out[187]: 5100
len(transformer_numeric.fit_transform(X_train))
Out[188]: 5100
``