import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings('ignore')
# 设置中文字体
plt.rcParams['font.sans-serif'] = ['Source Han Sans SC', 'Noto Sans CJK SC',
'WenQuanYi Micro Hei', 'SimHei']
plt.rcParams['axes.unicode_minus'] = False
# 加载LaLonde数据(使用模拟数据展示方法)
np.random.seed(42)
n = 1000
# 生成协变量
age = np.random.normal(25, 7, n)
education = np.random.normal(10, 2.5, n)
black = np.random.binomial(1, 0.5, n)
hispanic = np.random.binomial(1, 0.2, n)
married = np.random.binomial(1, 0.3, n)
nodegree = np.random.binomial(1, 0.6, n)
# 生成收入前置变量
re74 = np.random.lognormal(8, 1.5, n)
re75 = np.random.lognormal(8, 1.5, n)
# 处理变量(受协变量影响)
X_vars = np.column_stack([age, education, black, hispanic, married, nodegree, re74, re75])
X_scaled = (X_vars - X_vars.mean(axis=0)) / X_vars.std(axis=0)
# 倾向得分(Logit模型)
logit_ps = (0.3 * X_scaled[:, 0] - 0.5 * X_scaled[:, 1] + 0.8 * X_scaled[:, 2]
+ 0.3 * X_scaled[:, 5] - 0.2 * X_scaled[:, 6] / 1000)
ps = 1 / (1 + np.exp(-logit_ps))
treat = np.random.binomial(1, ps)
# 真实处理效应
true_effect = 1794
# 结果变量(收入)
re78_base = (1000 + 50 * age + 200 * education + 300 * married
+ 0.3 * re74 + 0.3 * re75 + np.random.normal(0, 3000, n))
re78 = re78_base + true_effect * treat
# 创建数据框
data = pd.DataFrame({
'treat': treat,
'age': age,
'education': education,
'black': black,
'hispanic': hispanic,
'married': married,
'nodegree': nodegree,
're74': re74,
're75': re75,
're78': re78,
'ps': ps
})
print(f"样本量: {n}")
print(f"处理组: {treat.sum()} ({treat.mean()*100:.1f}%)")
print(f"对照组: {(1-treat).sum()} ({(1-treat).mean()*100:.1f}%)")
print("\n数据前5行:")
print(data.head())