zhensongren · July 14, 2025 19:37
diff --git a/ccf.py b/ccf.py
 # %% [markdown]
 # A 4×4 grid of scatter plots
 # Each showing the relationship between lagged ts1 (predictor) and ts2 (target)

 # %%
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 from darts import TimeSeries
 from sklearn.linear_model import LinearRegression

 # Step 1: Simulate two Darts TimeSeries
 np.random.seed(42)
 n = 100
 dates = pd.date_range(start="2020-01-01", periods=n, freq="D")
 series_1 = pd.Series(np.random.normal(50, 10, size=n), index=dates)
 series_2 = pd.Series(0.7 * series_1.shift(2).fillna(method='bfill') + np.random.normal(0, 5, size=n), index=dates)

 ts_1 = TimeSeries.from_series(series_1.rename("ts1"))
 ts_2 = TimeSeries.from_series(series_2.rename("ts2"))

 # Step 2: Plotting function using Darts TimeSeries
 def darts_cross_corr_plot(ts_x: TimeSeries, ts_y: TimeSeries, max_lag: int = 12):
    df_x = ts_x.to_dataframe()
    df_y = ts_y.to_dataframe()
    target_col = df_y.columns[0]
    input_col = df_x.columns[0]

    fig, axes = plt.subplots(4, 4, figsize=(16, 12))
    axes = axes.flatten()

    for lag in range(max_lag + 1):
        x_lagged = df_x.shift(lag)
        df_comb = pd.concat([x_lagged, df_y], axis=1).dropna()
        x_vals = df_comb[input_col].values.reshape(-1, 1)
        y_vals = df_comb[target_col].values

        ax = axes[lag]
        sns.scatterplot(x=x_vals.flatten(), y=y_vals, ax=ax, s=20, color="teal")

        model = LinearRegression().fit(x_vals, y_vals)
        x_line = np.linspace(x_vals.min(), x_vals.max(), 100).reshape(-1, 1)
        y_line = model.predict(x_line)
        ax.plot(x_line, y_line, color='red', linewidth=1)

        corr = np.corrcoef(x_vals.flatten(), y_vals)[0, 1]
        ax.set_title(f"{input_col}(t-{lag}), r={corr:.2f}")

    for i in range(max_lag + 1, len(axes)):
        fig.delaxes(axes[i])

    plt.tight_layout()
    plt.show()

 # Step 3: Use the function
 darts_cross_corr_plot(ts_x=ts_1, ts_y=ts_2, max_lag=12)

 # %% [markdown]
 # Generate cross correlation plots from pandas dataframe series.

 # %%
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 from sklearn.linear_model import LinearRegression

 # Step 1: Create example time series
 np.random.seed(42)
 n = 150
 rain = np.random.gamma(shape=2.0, scale=15.0, size=n)
 ndvi = 0.5 * np.roll(rain, 1) + np.random.normal(0, 10, size=n)

 df = pd.DataFrame({'rain': rain, 'ndvi': ndvi})

 # Step 2: Define plot function
 def cross_corr_plot_matrix(df, x_col, y_col, max_lag=12):
    fig, axes = plt.subplots(4, 4, figsize=(16, 12))
    axes = axes.flatten()

    for lag in range(max_lag + 1):
        x_lagged = df[x_col].shift(lag)
        mask = x_lagged.notna() & df[y_col].notna()
        x_vals = x_lagged[mask].values.reshape(-1, 1)
        y_vals = df[y_col][mask].values

        ax = axes[lag]
        sns.scatterplot(x=x_vals.flatten(), y=y_vals, ax=ax, s=20, color='black')

        # Regression line
        model = LinearRegression().fit(x_vals, y_vals)
        x_line = np.linspace(x_vals.min(), x_vals.max(), 100).reshape(-1, 1)
        y_line = model.predict(x_line)
        ax.plot(x_line, y_line, color='red', linewidth=1)

        # Pearson correlation
        corr = np.corrcoef(x_vals.flatten(), y_vals)[0, 1]
        ax.set_title(f"{x_col}(t-{lag}), r={corr:.2f}")

    # Hide unused subplots
    for i in range(max_lag + 1, len(axes)):
        fig.delaxes(axes[i])

    plt.tight_layout()
    plt.show()

 # Step 3: Run the plot
 cross_corr_plot_matrix(df, x_col='rain', y_col='ndvi', max_lag=12)


 # %%


 # %%
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 from statsmodels.tsa.stattools import ccf

 # Simulated example data
 np.random.seed(42)
 n = 100
 # rain is the leading variable
 rain = np.random.normal(10, 2, n)
 # NDVI lags behind rain; create ndvi with roll and lag 2 and add noise
 ndvi = np.roll(rain, 2) + np.random.normal(0, 1, n)  # NDVI lags behind rain
 data = pd.DataFrame({'rain': rain, 'ndvi': ndvi})

 # Standardize the series (recommended before computing CCF)
 rain_std = (data['rain'] - np.mean(data['rain'])) / np.std(data['rain'])
 ndvi_std = (data['ndvi'] - np.mean(data['ndvi'])) / np.std(data['ndvi'])


 # Compute cross-correlation for full range of lags
 max_lag = 20
 lags = np.arange(-max_lag, max_lag + 1)
 ccf_full = []

 for lag in lags:
    if lag < 0:
        x = rain_std[:lag]
        y = ndvi_std[-lag:]
    elif lag > 0:
        x = rain_std[lag:]
        y = ndvi_std[:-lag]
    else:
        x = rain_std
        y = ndvi_std

    corr = np.corrcoef(x, y)[0, 1]
    ccf_full.append(corr)

 # Plot
 plt.figure(figsize=(10, 5))
 plt.stem(lags, ccf_full)
 plt.axhline(0, color='black')
 conf_level = 2 / np.sqrt(n)  # 95% confidence bound (approx)
 plt.axhline(conf_level, color='blue', linestyle='--')
 plt.axhline(-conf_level, color='blue', linestyle='--')
 plt.title("Cross-Correlation Function (rain vs. NDVI)")
 plt.xlabel("Lag")
 plt.ylabel("Correlation")
 plt.grid(True)
 plt.show()
	# %% [markdown]
	# A 4×4 grid of scatter plots
	# Each showing the relationship between lagged ts1 (predictor) and ts2 (target)

	# %%
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from darts import TimeSeries
	from sklearn.linear_model import LinearRegression

	# Step 1: Simulate two Darts TimeSeries
	np.random.seed(42)
	n = 100
	dates = pd.date_range(start="2020-01-01", periods=n, freq="D")
	series_1 = pd.Series(np.random.normal(50, 10, size=n), index=dates)
	series_2 = pd.Series(0.7 * series_1.shift(2).fillna(method='bfill') + np.random.normal(0, 5, size=n), index=dates)

	ts_1 = TimeSeries.from_series(series_1.rename("ts1"))
	ts_2 = TimeSeries.from_series(series_2.rename("ts2"))

	# Step 2: Plotting function using Darts TimeSeries
	def darts_cross_corr_plot(ts_x: TimeSeries, ts_y: TimeSeries, max_lag: int = 12):
	df_x = ts_x.to_dataframe()
	df_y = ts_y.to_dataframe()
	target_col = df_y.columns[0]
	input_col = df_x.columns[0]

	fig, axes = plt.subplots(4, 4, figsize=(16, 12))
	axes = axes.flatten()

	for lag in range(max_lag + 1):
	x_lagged = df_x.shift(lag)
	df_comb = pd.concat([x_lagged, df_y], axis=1).dropna()
	x_vals = df_comb[input_col].values.reshape(-1, 1)
	y_vals = df_comb[target_col].values

	ax = axes[lag]
	sns.scatterplot(x=x_vals.flatten(), y=y_vals, ax=ax, s=20, color="teal")

	model = LinearRegression().fit(x_vals, y_vals)
	x_line = np.linspace(x_vals.min(), x_vals.max(), 100).reshape(-1, 1)
	y_line = model.predict(x_line)
	ax.plot(x_line, y_line, color='red', linewidth=1)

	corr = np.corrcoef(x_vals.flatten(), y_vals)[0, 1]
	ax.set_title(f"{input_col}(t-{lag}), r={corr:.2f}")

	for i in range(max_lag + 1, len(axes)):
	fig.delaxes(axes[i])

	plt.tight_layout()
	plt.show()

	# Step 3: Use the function
	darts_cross_corr_plot(ts_x=ts_1, ts_y=ts_2, max_lag=12)

	# %% [markdown]
	# Generate cross correlation plots from pandas dataframe series.

	# %%
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from sklearn.linear_model import LinearRegression

	# Step 1: Create example time series
	np.random.seed(42)
	n = 150
	rain = np.random.gamma(shape=2.0, scale=15.0, size=n)
	ndvi = 0.5 * np.roll(rain, 1) + np.random.normal(0, 10, size=n)

	df = pd.DataFrame({'rain': rain, 'ndvi': ndvi})

	# Step 2: Define plot function
	def cross_corr_plot_matrix(df, x_col, y_col, max_lag=12):
	fig, axes = plt.subplots(4, 4, figsize=(16, 12))
	axes = axes.flatten()

	for lag in range(max_lag + 1):
	x_lagged = df[x_col].shift(lag)
	mask = x_lagged.notna() & df[y_col].notna()
	x_vals = x_lagged[mask].values.reshape(-1, 1)
	y_vals = df[y_col][mask].values

	ax = axes[lag]
	sns.scatterplot(x=x_vals.flatten(), y=y_vals, ax=ax, s=20, color='black')

	# Regression line
	model = LinearRegression().fit(x_vals, y_vals)
	x_line = np.linspace(x_vals.min(), x_vals.max(), 100).reshape(-1, 1)
	y_line = model.predict(x_line)
	ax.plot(x_line, y_line, color='red', linewidth=1)

	# Pearson correlation
	corr = np.corrcoef(x_vals.flatten(), y_vals)[0, 1]
	ax.set_title(f"{x_col}(t-{lag}), r={corr:.2f}")

	# Hide unused subplots
	for i in range(max_lag + 1, len(axes)):
	fig.delaxes(axes[i])

	plt.tight_layout()
	plt.show()

	# Step 3: Run the plot
	cross_corr_plot_matrix(df, x_col='rain', y_col='ndvi', max_lag=12)


	# %%


	# %%
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from statsmodels.tsa.stattools import ccf

	# Simulated example data
	np.random.seed(42)
	n = 100
	# rain is the leading variable
	rain = np.random.normal(10, 2, n)
	# NDVI lags behind rain; create ndvi with roll and lag 2 and add noise
	ndvi = np.roll(rain, 2) + np.random.normal(0, 1, n) # NDVI lags behind rain
	data = pd.DataFrame({'rain': rain, 'ndvi': ndvi})

	# Standardize the series (recommended before computing CCF)
	rain_std = (data['rain'] - np.mean(data['rain'])) / np.std(data['rain'])
	ndvi_std = (data['ndvi'] - np.mean(data['ndvi'])) / np.std(data['ndvi'])


	# Compute cross-correlation for full range of lags
	max_lag = 20
	lags = np.arange(-max_lag, max_lag + 1)
	ccf_full = []

	for lag in lags:
	if lag < 0:
	x = rain_std[:lag]
	y = ndvi_std[-lag:]
	elif lag > 0:
	x = rain_std[lag:]
	y = ndvi_std[:-lag]
	else:
	x = rain_std
	y = ndvi_std

	corr = np.corrcoef(x, y)[0, 1]
	ccf_full.append(corr)

	# Plot
	plt.figure(figsize=(10, 5))
	plt.stem(lags, ccf_full)
	plt.axhline(0, color='black')
	conf_level = 2 / np.sqrt(n) # 95% confidence bound (approx)
	plt.axhline(conf_level, color='blue', linestyle='--')
	plt.axhline(-conf_level, color='blue', linestyle='--')
	plt.title("Cross-Correlation Function (rain vs. NDVI)")
	plt.xlabel("Lag")
	plt.ylabel("Correlation")
	plt.grid(True)
	plt.show()