import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
@pipeline_stage
def load_data():
    """Read the raw dataset from disk.

    Returns:
        A DataFrame holding the contents of ``dataset.csv`` from the
        current working directory.
    """
    return pd.read_csv('dataset.csv')
@pipeline_stage
def clean_data(raw_data):
    """Clean and preprocess the data.

    Drops duplicate rows, then imputes missing values in numeric
    columns with that column's mean.

    Args:
        raw_data: Input DataFrame, possibly containing duplicate rows
            and NaN values.

    Returns:
        A new, deduplicated DataFrame with numeric NaNs mean-imputed.
        The input is not mutated.
    """
    cleaned = raw_data.drop_duplicates()
    # numeric_only=True: DataFrame.mean() over all columns raises a
    # TypeError on non-numeric columns in pandas >= 2.0 (and warned
    # before that) — restrict imputation to numeric data.
    cleaned = cleaned.fillna(cleaned.mean(numeric_only=True))
    return cleaned
@pipeline_stage
def feature_engineering(clean_data):
    """Derive model features from the cleaned data.

    Adds an 'interaction' column, the element-wise product of 'col1'
    and 'col2', on a copy so the input DataFrame is left untouched.

    Returns:
        A new DataFrame with the added 'interaction' column.
    """
    engineered = clean_data.copy()
    engineered['interaction'] = engineered['col1'] * engineered['col2']
    return engineered
@pipeline_stage
def split_data(features, test_size=0.2, random_state=None):
    """Split data into train/test sets.

    Args:
        features: DataFrame containing the predictors plus a 'target'
            column.
        test_size: Fraction of rows held out for testing. Defaults to
            0.2, the previously hard-coded value.
        random_state: Seed for the shuffle; pass an int for reproducible
            splits. Default None preserves the original (unseeded)
            behavior.

    Returns:
        (X_train, X_test, y_train, y_test) as produced by
        sklearn.model_selection.train_test_split.
    """
    X = features.drop('target', axis=1)
    y = features['target']
    return train_test_split(X, y, test_size=test_size,
                            random_state=random_state)
@pipeline_stage
def scale_features(X_train, X_test):
    """Standardize features to zero mean and unit variance.

    The scaler is fit on the training split only and then applied to
    both splits, so no test-set statistics leak into the transform.

    NOTE(review): the fitted scaler is discarded on return, so the same
    transformation cannot be reapplied to future/production data —
    confirm this is intentional.

    Returns:
        (X_train_scaled, X_test_scaled) as numpy arrays.
    """
    standardizer = StandardScaler().fit(X_train)
    return standardizer.transform(X_train), standardizer.transform(X_test)
@pipeline_stage
def train_model(X_train, y_train, n_estimators=100, random_state=None):
    """Train the machine learning model.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        n_estimators: Number of trees in the forest. Defaults to 100,
            the previously hard-coded value.
        random_state: Seed for the forest's randomness; pass an int for
            reproducible models. Default None preserves prior behavior.

    Returns:
        The fitted RandomForestClassifier.
    """
    # Local import, as in the original: keeps the heavyweight ensemble
    # module from loading until this stage actually runs.
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   random_state=random_state)
    model.fit(X_train, y_train)
    return model
if __name__ == "__main__":
    # Script entry point: build the pipeline and run every stage.
    # NOTE(review): DataPipeline is not defined or imported in the
    # visible portion of this file — presumably provided elsewhere
    # (perhaps by the pipeline_stage machinery); verify before running.
    DataPipeline().run()