演習：離反予測モデルを構築しよう

田中VPoE:「さあ、ここまで学んだモデルを実際に構築して比較してみよう。NetShop の離反予測データに対して、ロジスティック回帰、ランダムフォレスト、勾配ブースティング（LightGBM）の3つのモデルを構築し、最適なモデルを選定してくれ。」

あなた:「データ準備から前処理、モデル構築、比較まで一通りやるんですね。」

田中VPoE:「そうだ。Step 2 で学んだ前処理パイプラインも組み合わせて、再現可能な形で構築してほしい。」

ミッション概要

NetShop の顧客離反予測のためのモデルを複数構築し、比較・選定します。前処理からモデル構築・評価までの一連のパイプラインを実装してください。

Mission 1: データ準備と前処理パイプラインの構築

リアルに近い離反予測データを生成し、前処理パイプラインを構築してください。

要件

5000件の顧客データを生成（離反率8%）
数値特徴量とカテゴリカル特徴量を含める
学習/検証/テストの3分割を行う
ColumnTransformer で前処理パイプラインを構築する

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

# Fix the global NumPy random seed so repeated runs draw the same numbers.
np.random.seed(42)
# Write the data generation and preprocessing pipeline code here.

解答例

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

# Fix the global NumPy random seed so repeated runs generate the same data.
np.random.seed(42)
n = 5000
# Churn rate required by the exercise statement (8%).
target_churn_rate = 0.08

# Data generation: synthetic behavioural and attribute columns per customer.
df = pd.DataFrame({
    'purchase_count': np.random.poisson(8, n),
    'avg_amount': np.random.lognormal(8, 0.8, n).astype(int),
    'days_inactive': np.random.exponential(25, n).astype(int),
    'page_views': np.random.poisson(30, n),
    'support_tickets': np.random.poisson(1.5, n),
    'membership_months': np.random.exponential(18, n).astype(int),
    'device_type': np.random.choice(['PC', 'スマホ', 'タブレット'], n, p=[0.3, 0.55, 0.15]),
    'membership_rank': np.random.choice(['ブロンズ', 'シルバー', 'ゴールド'], n, p=[0.5, 0.3, 0.2]),
    'payment_method': np.random.choice(['クレジット', '銀行振込', 'コンビニ'], n, p=[0.5, 0.3, 0.2]),
})

# Churn label.
# Linear churn score (logit without intercept): inactivity and support
# tickets raise it, purchase frequency lowers it.
churn_score = (
    (df['days_inactive'] - 35) / 12
    - df['purchase_count'] * 0.08
    + df['support_tickets'] * 0.15
)

# Without an intercept the score above yields a churn rate far above the
# required 8%. Find the intercept by bisection so that the mean churn
# probability matches target_churn_rate; the mean probability is monotonic
# in the intercept, so bisection converges. No random numbers are consumed.
lower, upper = -20.0, 20.0
for _ in range(60):
    intercept = (lower + upper) / 2
    mean_prob = (1 / (1 + np.exp(-(churn_score + intercept)))).mean()
    if mean_prob > target_churn_rate:
        upper = intercept
    else:
        lower = intercept
intercept = (lower + upper) / 2

churn_prob = 1 / (1 + np.exp(-(churn_score + intercept)))
df['is_churned'] = (np.random.random(n) < churn_prob).astype(int)

# Inject missing values BEFORE deriving features, so the derived columns
# cannot leak the values that are masked here.
# Integer columns cannot hold NaN, so cast to float explicitly instead of
# relying on implicit upcasting during assignment.
df = df.astype({'avg_amount': float, 'page_views': float})
df.loc[np.random.random(n) < 0.03, 'avg_amount'] = np.nan
df.loc[np.random.random(n) < 0.02, 'page_views'] = np.nan

# Feature engineering (NaN propagates from the raw columns and is imputed
# later by the preprocessing pipeline). Zero denominators are replaced by 1
# to avoid division by zero.
df['purchase_per_view'] = df['purchase_count'] / df['page_views'].replace(0, 1)
df['support_per_purchase'] = df['support_tickets'] / df['purchase_count'].replace(0, 1)
df['log_amount'] = np.log1p(df['avg_amount'])

print(f"データ件数: {len(df)}, 離反率: {df['is_churned'].mean():.1%}")

# Column groups, one per kind of preprocessing applied later.
numeric_features = [
    'purchase_count',
    'avg_amount',
    'days_inactive',
    'page_views',
    'support_tickets',
    'membership_months',
    'purchase_per_view',
    'support_per_purchase',
    'log_amount',
]
onehot_features = ['device_type', 'payment_method']
ordinal_features = ['membership_rank']

# Feature matrix and target vector.
feature_cols = [*numeric_features, *onehot_features, *ordinal_features]
X = df[feature_cols]
y = df['is_churned']

# Two-stage stratified split: hold out 20% as the test set, then take 25%
# of the remaining 80% as the validation set (60/20/20 overall).
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=42,
    stratify=y_temp,
)

print(f"学習: {len(X_train)}, 検証: {len(X_val)}, テスト: {len(X_test)}")

# Preprocessing pipeline, built from one sub-pipeline per column group.

# Numeric columns: median imputation followed by standardization.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Nominal columns: most-frequent imputation, then one-hot encoding with the
# first category dropped and a dense output array.
onehot_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False)),
])

# Ordered column: most-frequent imputation, then integer codes following
# the explicit rank order below.
rank_order = ['ブロンズ', 'シルバー', 'ゴールド']
ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=[rank_order])),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('onehot', onehot_pipeline, onehot_features),
    ('ordinal', ordinal_pipeline, ordinal_features),
])

Mission 2: 3つのモデルを構築し比較する

ロジスティック回帰、ランダムフォレスト、LightGBM の3モデルを構築し、交差検証で比較してください。

要件

各モデルを Pipeline に組み込む
5-fold 交差検証で F1スコアを計算する
学習時間も記録する
結果を表形式で比較する

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
import time

# ここにモデル構築と比較のコードを書く

解答例

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb_module
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, classification_report
import time

from sklearn.base import clone

# Candidate models. Each pipeline receives its own unfitted copy of the
# preprocessor: Pipeline.fit fits its steps in place, so sharing a single
# instance would make every pipeline overwrite the others' fitted state.
models = {
    'ロジスティック回帰': Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', LogisticRegression(max_iter=1000, random_state=42, C=1.0)),
    ]),
    'ランダムフォレスト': Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1)),
    ]),
    'LightGBM': Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', lgb_module.LGBMClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, random_state=42, verbose=-1)),
    ]),
}

results = []
for name, pipeline in models.items():
    # Elapsed wall-clock time of the whole 5-fold cross-validation
    # (five fits plus scoring). perf_counter is a monotonic clock intended
    # for measuring durations.
    start = time.perf_counter()
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1')
    elapsed = time.perf_counter() - start

    # Score on the validation split. The test set is deliberately left
    # untouched while comparing models so that it can give an unbiased
    # estimate for the model that is finally selected.
    pipeline.fit(X_train, y_train)
    y_pred_val = pipeline.predict(X_val)
    val_f1 = f1_score(y_val, y_pred_val)

    results.append({
        'モデル': name,
        'CV F1（平均）': scores.mean(),
        'CV F1（標準偏差）': scores.std(),
        '検証 F1': val_f1,
        '学習時間（秒）': elapsed,
    })

    print(f"\n{'='*50}")
    print(f"{name}")
    print(f"{'='*50}")
    print(f"CV F1: {scores.mean():.3f} (+/- {scores.std():.3f})")
    print(f"検証 F1: {val_f1:.3f}")
    print(f"学習時間: {elapsed:.2f}秒")

# Comparison table
comparison_df = pd.DataFrame(results)
print(f"\n{'='*60}")
print("モデル比較結果")
print(comparison_df.to_string(index=False))

Mission 3: 最適モデルの選定と詳細分析

交差検証の F1スコアが最も高いモデルを選び、詳細な分析を行ってください。

要件

最適モデルの混同行列を表示する
特徴量重要度を可視化する
ROC曲線を描画する
モデル選定の理由を3つ以上述べる

from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    roc_curve, roc_auc_score
)
import matplotlib.pyplot as plt

# ここに詳細分析のコードを書く

解答例

from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    roc_curve, roc_auc_score, classification_report
)
import matplotlib.pyplot as plt

# Select the best model from the comparison table instead of assuming a
# winner: the row with the highest mean cross-validation F1.
best_row = comparison_df.loc[comparison_df['CV F1（平均）'].idxmax()]
best_name = best_row['モデル']
best_cv_f1 = best_row['CV F1（平均）']
best_elapsed = best_row['学習時間（秒）']

best_pipeline = models[best_name]
best_pipeline.fit(X_train, y_train)

# The test split is used here, for the final evaluation of the chosen model.
y_pred = best_pipeline.predict(X_test)
y_prob = best_pipeline.predict_proba(X_test)[:, 1]

# 1. Classification report
print("=== 分類レポート ===")
print(classification_report(y_test, y_pred, target_names=['継続', '離反']))

# 2. Confusion matrix
# NOTE(review): the Japanese labels below presumably need a matplotlib font
# with Japanese glyphs to render correctly — confirm in the target environment.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred, display_labels=['継続', '離反'], ax=axes[0]
)
axes[0].set_title('混同行列')

# 3. ROC curve (the thresholds returned by roc_curve are not needed)
fpr, tpr, _ = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)
axes[1].plot(fpr, tpr, label=f'{best_name} (AUC = {auc:.3f})')
axes[1].plot([0, 1], [0, 1], 'k--')
axes[1].set_xlabel('偽陽性率')
axes[1].set_ylabel('真陽性率')
axes[1].set_title('ROC曲線')
axes[1].legend()
axes[1].grid(True)
plt.tight_layout()
plt.show()

# 4. Feature importance
best_model = best_pipeline.named_steps['model']
feature_names_out = best_pipeline.named_steps['preprocessor'].get_feature_names_out()
if hasattr(best_model, 'feature_importances_'):
    importance_values = best_model.feature_importances_
else:
    # Models without feature_importances_ (e.g. logistic regression):
    # fall back to the absolute value of the coefficients.
    importance_values = np.abs(best_model.coef_).ravel()
importance = pd.Series(importance_values, index=feature_names_out)
importance = importance.sort_values(ascending=True)

plt.figure(figsize=(10, 8))
importance.tail(15).plot(kind='barh', color='steelblue')
plt.title(f'特徴量重要度 TOP 15（{best_name}）')
plt.xlabel('重要度')
plt.tight_layout()
plt.show()

# 5. Selection rationale, backed by the measured numbers where possible
print("\n=== モデル選定理由 ===")
print(f"1. 交差検証の F1スコアが{len(comparison_df)}モデル中最も高い（{best_name}: {best_cv_f1:.3f}）")
print(f"2. 学習時間が {best_elapsed:.2f}秒 と短く、定期的な再学習に適している")
print("3. 特徴量重要度が取得でき、ビジネスへの説明が可能")
print("4. 不均衡データへの対応パラメータ（class_weight など）が利用できる")
print(f"5. AUC = {auc:.3f} と高い識別力を持つ")

達成度チェック

リアルに近い離反予測データを生成した
前処理パイプラインを ColumnTransformer で構築した
3つのモデルを構築し、交差検証で比較した
最適モデルの混同行列と ROC 曲線を描画した
特徴量重要度を可視化した
モデル選定の理由を論理的に述べた

推定所要時間: 90分