new file: compare-chose-deploy.ipynb new file: datasets/winequality-white.csv modified: logging-first-model.ipynb
28 KiB
记录第一个模型¶
from mlflow import MlflowClient from pprint import pprint from sklearn.ensemble import RandomForestRegressor
创建一个 MLflow 客户端对象¶
使用 mlflow server --host 127.0.0.1 --port 8080
启动一个服务。
# NOTE: make sure this URI matches the address of the tracking server you started
client = MlflowClient(tracking_uri="http://127.0.0.1:8080")
使用 MLflow 客户端 API 搜索创建的实验¶
Let's take a look at the Default Experiment that is created for us.
This safe 'fallback' experiment will store Runs that we create if we don't specify a new experiment.
# Calling search_experiments() with no query terms effectively lists every experiment
all_experiments = client.search_experiments()

print(all_experiments)
[<Experiment: artifact_location='mlflow-artifacts:/345492218691480896', creation_time=1705671588054, experiment_id='345492218691480896', last_update_time=1705671588054, lifecycle_stage='active', name='MLflow Quickstart', tags={}>, <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1705670739503, experiment_id='0', last_update_time=1705670739503, lifecycle_stage='active', name='Default', tags={}>]
# Collect the name and lifecycle_stage of every experiment called "Default" ...
default_matches = [
    {"name": exp.name, "lifecycle_stage": exp.lifecycle_stage}
    for exp in all_experiments
    if exp.name == "Default"
]

# ... and keep the first match (raises IndexError if there is none)
default_experiment = default_matches[0]

pprint(default_experiment)
{'lifecycle_stage': 'active', 'name': 'Default'}
创建一个新实验¶
In this section, we'll:
- create a new MLflow Experiment
- apply metadata in the form of Experiment Tags
# Free-text description of the experiment; stored under the "mlflow.note.content" tag below
experiment_description = (
    "This is the grocery forecasting project. "
    "This experiment contains the produce models for apples."
)

# Searchable metadata attached to the experiment as tags
experiment_tags = {
    "project_name": "grocery-forecasting",
    "store_dept": "produce",
    "team": "stores-ml",
    "project_quarter": "Q3-2023",
    "mlflow.note.content": experiment_description,
}

# create_experiment() raises if an experiment with this name already exists,
# which makes re-running this cell fail. Look the experiment up first and only
# create it when it is missing; either way the variable holds the experiment ID.
experiment_name = "Apple_Models"
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    produce_apples_experiment = client.create_experiment(
        name=experiment_name, tags=experiment_tags
    )
else:
    produce_apples_experiment = existing_experiment.experiment_id
# Filter experiments on the value of the project_name tag
project_filter = "tags.`project_name` = 'grocery-forecasting'"
apples_experiment = client.search_experiments(filter_string=project_filter)

pprint(apples_experiment[0])
<Experiment: artifact_location='mlflow-artifacts:/715035017833909618', creation_time=1705818634613, experiment_id='715035017833909618', last_update_time=1705818634613, lifecycle_stage='active', name='Apple_Models', tags={'mlflow.note.content': 'This is the grocery forecasting project. This ' 'experiment contains the produce models for apples.', 'project_name': 'grocery-forecasting', 'project_quarter': 'Q3-2023', 'store_dept': 'produce', 'team': 'stores-ml'}>
# Read a single tag value from the first matching experiment
team_tag = apples_experiment[0].tags["team"]
print(team_tag)
stores-ml
运行第一个模型训练¶
In this section, we'll:
- create a synthetic data set that is relevant to a simple demand forecasting task
- start an MLflow run
- log metrics, parameters, and tags to the run
- save the model to the run
- register the model during model logging
生成苹果需求的合成数据¶
Keep in mind that this is purely for demonstration purposes.
The demand value is purely artificial and is deliberately covariant with the features. This is not a particularly realistic real-world scenario (if it were, we wouldn't need Data Scientists!).
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import Optional


def generate_apple_sales_data_with_promo_adjustment(
    base_demand: int = 1000,
    n_rows: int = 5000,
    end_date: Optional[datetime] = None,
):
    """
    Generate a synthetic daily dataset for predicting apple sales demand.

    The returned DataFrame has one row per day, oldest first, ending at
    ``end_date``. The target column ``demand`` is built from the base demand,
    a price effect, a promotion effect, a weekend effect and Gaussian noise,
    scaled by a yearly inflation multiplier.

    The global NumPy random generator is re-seeded with a fixed value on every
    call, so the random columns are reproducible. The ``date`` and ``weekend``
    columns (and everything derived from the calendar) depend on ``end_date``.

    Args:
        base_demand (int, optional): Base demand for apples. Defaults to 1000.
        n_rows (int, optional): Number of rows (days) of data to generate.
            Defaults to 5000.
        end_date (datetime, optional): Timestamp of the last (most recent) row.
            Defaults to the current time, evaluated once per call.

    Returns:
        pd.DataFrame: Columns ``date``, ``average_temperature``, ``rainfall``,
        ``weekend``, ``holiday``, ``price_per_kg``, ``promo``, ``demand`` and
        ``previous_days_demand``.

    Example:
        >>> df = generate_apple_sales_data_with_promo_adjustment(base_demand=1200, n_rows=6000)
        >>> df.head()
    """
    # Set seed for reproducibility (this re-seeds NumPy's global generator)
    np.random.seed(9999)

    # Anchor every row to one reference instant; calling datetime.now() per row
    # would give each date a slightly different time-of-day.
    last_day = datetime.now() if end_date is None else end_date
    dates = [last_day - timedelta(days=offset) for offset in range(n_rows - 1, -1, -1)]

    # Generate features. The order of the np.random calls is significant: it
    # determines which random values each column receives for the fixed seed.
    df = pd.DataFrame(
        {
            "date": dates,
            "average_temperature": np.random.uniform(10, 35, n_rows),
            "rainfall": np.random.exponential(5, n_rows),
            "weekend": [(date.weekday() >= 5) * 1 for date in dates],
            "holiday": np.random.choice([0, 1], n_rows, p=[0.97, 0.03]),
            "price_per_kg": np.random.uniform(0.5, 3, n_rows),
            "month": [date.month for date in dates],
        }
    )

    # Introduce inflation over time: +3% per calendar year after the first year
    years = df["date"].dt.year
    df["inflation_multiplier"] = 1 + (years - years.min()) * 0.03

    # Account for the seasonality of the apple harvest.
    # NOTE(review): the two sine terms are exactly half a period (6 months)
    # apart, so they cancel and harvest_effect is ~0 for every month. Left
    # unchanged so the generated data stays the same -- confirm the intent.
    df["harvest_effect"] = np.sin(2 * np.pi * (df["month"] - 3) / 12) + np.sin(
        2 * np.pi * (df["month"] - 9) / 12
    )

    # Modify price_per_kg according to the harvest effect
    df["price_per_kg"] = df["price_per_kg"] - df["harvest_effect"] * 0.5

    # Always run a promotion in the months that lag peak harvest by one month;
    # in other months a promotion happens with 15% probability.
    peak_months = [4, 10]  # months following the peak availability
    df["promo"] = np.where(
        df["month"].isin(peak_months),
        1,
        np.random.choice([0, 1], n_rows, p=[0.85, 0.15]),
    )

    # Generate the target variable from the features
    base_price_effect = -df["price_per_kg"] * 50
    seasonality_effect = df["harvest_effect"] * 50
    promo_effect = df["promo"] * 200

    df["demand"] = (
        base_demand
        + base_price_effect
        + seasonality_effect
        + promo_effect
        + df["weekend"] * 300
        + np.random.normal(0, 50, n_rows)  # random noise
    ) * df["inflation_multiplier"]

    # Add previous day's demand. The first row has no predecessor, so it is
    # back-filled with its own demand. Assigning the result (instead of the
    # deprecated fillna(method=..., inplace=True) on a column selection) avoids
    # the FutureWarning and keeps working when chained in-place updates do not.
    df["previous_days_demand"] = df["demand"].shift(1).bfill()

    # Drop temporary columns
    df = df.drop(columns=["inflation_multiplier", "harvest_effect", "month"])

    return df
# Build a 1,000-day dataset and show its last 20 rows
data = generate_apple_sales_data_with_promo_adjustment(base_demand=1_000, n_rows=1_000)

data.tail(20)
/tmp/ipykernel_1170/724292291.py:84: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. df["previous_days_demand"].fillna(method="bfill", inplace=True) # fill the first row
date | average_temperature | rainfall | weekend | holiday | price_per_kg | promo | demand | previous_days_demand | |
---|---|---|---|---|---|---|---|---|---|
980 | 2024-01-02 15:05:05.013229 | 34.130183 | 1.454065 | 0 | 0 | 1.449177 | 0 | 999.306290 | 1029.418398 |
981 | 2024-01-03 15:05:05.013227 | 32.353643 | 9.462859 | 0 | 0 | 2.856503 | 0 | 842.129427 | 999.306290 |
982 | 2024-01-04 15:05:05.013225 | 18.816833 | 0.391470 | 0 | 0 | 1.326429 | 0 | 990.616709 | 842.129427 |
983 | 2024-01-05 15:05:05.013223 | 34.533012 | 2.120477 | 0 | 0 | 0.970131 | 0 | 1068.802075 | 990.616709 |
984 | 2024-01-06 15:05:05.013222 | 23.057202 | 2.365705 | 1 | 0 | 1.049931 | 0 | 1346.486305 | 1068.802075 |
985 | 2024-01-07 15:05:05.013220 | 34.810165 | 3.089005 | 1 | 0 | 2.035149 | 0 | 1329.564672 | 1346.486305 |
986 | 2024-01-08 15:05:05.013218 | 29.208905 | 3.673292 | 0 | 0 | 2.518098 | 0 | 1086.143402 | 1329.564672 |
987 | 2024-01-09 15:05:05.013216 | 16.428676 | 4.077782 | 0 | 0 | 1.268979 | 0 | 1093.207186 | 1086.143402 |
988 | 2024-01-10 15:05:05.013214 | 32.067512 | 2.734454 | 0 | 0 | 0.762317 | 0 | 1069.939894 | 1093.207186 |
989 | 2024-01-11 15:05:05.013213 | 31.938203 | 13.883486 | 0 | 0 | 1.153301 | 0 | 994.409540 | 1069.939894 |
990 | 2024-01-12 15:05:05.013211 | 18.024055 | 7.544061 | 0 | 0 | 0.610703 | 0 | 1078.323183 | 994.409540 |
991 | 2024-01-13 15:05:05.013209 | 20.681067 | 18.820490 | 1 | 0 | 1.533488 | 0 | 1328.499120 | 1078.323183 |
992 | 2024-01-14 15:05:05.013207 | 16.010132 | 7.705941 | 1 | 0 | 1.632498 | 1 | 1548.922141 | 1328.499120 |
993 | 2024-01-15 15:05:05.013198 | 18.766455 | 6.274840 | 0 | 0 | 2.806554 | 0 | 956.412724 | 1548.922141 |
994 | 2024-01-16 15:05:05.013196 | 27.948793 | 23.705246 | 0 | 0 | 0.829464 | 0 | 1090.592622 | 956.412724 |
995 | 2024-01-17 15:05:05.013194 | 28.661072 | 10.329865 | 0 | 0 | 2.290591 | 0 | 936.465043 | 1090.592622 |
996 | 2024-01-18 15:05:05.013192 | 10.821693 | 3.575645 | 0 | 0 | 0.897473 | 0 | 1016.336362 | 936.465043 |
997 | 2024-01-19 15:05:05.013190 | 21.108560 | 6.221089 | 0 | 0 | 1.093864 | 0 | 1063.698477 | 1016.336362 |
998 | 2024-01-20 15:05:05.013187 | 29.451301 | 5.021463 | 1 | 0 | 2.493085 | 0 | 1306.255235 | 1063.698477 |
999 | 2024-01-21 15:05:05.013172 | 19.261458 | 0.438381 | 1 | 0 | 2.610422 | 0 | 1207.188828 | 1306.255235 |
训练并记录模型数据¶
We're now ready to import our model class and train a RandomForestRegressor.
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Name for this training iteration.
# When no name is given, a unique one is auto-generated for the run.
run_name = "apples_rf_test"

# Artifact path under which the model will be saved.
artifact_path = "rf_apples"

# Point the fluent API at the tracking server
mlflow.set_tracking_uri("http://127.0.0.1:8080")

# Make "Apple_Models" the active experiment; the call returns the Experiment metadata
apple_experiment = mlflow.set_experiment("Apple_Models")
# Target is the demand column; features are everything except the date and the target
y = data["demand"]
X = data.drop(columns=["date", "demand"])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameters for the forest; also logged to the run below
params = {
    "n_estimators": 100,
    "max_depth": 6,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "bootstrap": True,
    "oob_score": False,
    "random_state": 888,
}

# Build the RandomForestRegressor and fit it on the training data
rf = RandomForestRegressor(**params)
rf.fit(X_train, y_train)

# Score the validation set
y_pred = rf.predict(X_val)

# Validation error metrics
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

# Gather the metrics into one mapping so they can be logged in a single call
metrics = {"mae": mae, "mse": mse, "rmse": rmse, "r2": r2}

# Everything logged inside this context is attached to one MLflow run
with mlflow.start_run(run_name=run_name) as run:
    # Hyperparameters used for the fit
    mlflow.log_params(params)

    # Error metrics computed on the validation set
    mlflow.log_metrics(metrics)

    # The fitted model itself, stored for later use
    mlflow.sklearn.log_model(sk_model=rf, input_example=X_val, artifact_path=artifact_path)
/home/deck/miniconda3/envs/mlflow/lib/python3.10/site-packages/mlflow/models/signature.py:358: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details. input_schema = _infer_schema(input_example)
Success!¶
You've just logged your first MLflow model!
Navigate to the MLflow UI to see the run that was just created (named "apples_rf_test", logged to the Experiment "Apple_Models").