defgenerate_sbm(n, sizes, p_matrix, seed=None): """ Generate a graph from the Stochastic Block Model (SBM).
Parameters: ---------- n : int Total number of nodes. sizes : list of int Sizes of each community (sum(sizes) == n). p_matrix : 2D array-like of shape (r, r) Connection probability matrix between communities. seed : int, optional Random seed for reproducibility.
Returns: ------- G : networkx.Graph Generated SBM graph with node attribute 'block' indicating community. labels : np.ndarray of shape (n,) Community label for each node. """ if seed isnotNone: np.random.seed(seed) r = len(sizes) assert p_matrix.shape == (r, r), "p_matrix must be r x r"
# Assign block labels labels = np.hstack([[i] * size for i, size inenumerate(sizes)])
# Initialize adjacency matrix A = np.zeros((n, n), dtype=int) for i inrange(n): for j inrange(i + 1, n): pi = labels[i] pj = labels[j] if np.random.rand() < p_matrix[pi, pj]: A[i, j] = A[j, i] = 1
# Build graph G = nx.from_numpy_array(A) # Assign block attribute for idx, lbl inenumerate(labels): G.nodes[idx]['block'] = int(lbl)
defgenerate_contextual_sbm(n, pi, p_matrix, theta, feature_dist='gaussian', seed=None): """ Generate a graph with attributes from the Contextual SBM (cSBM).
Parameters: ---------- n : int Total number of nodes. pi : list or np.ndarray of shape (r,) Mixing proportions for each of r communities. p_matrix : 2D array-like of shape (r, r) Connection probability matrix between communities. theta : list of dict Parameters for each community's feature distribution. For Gaussian: {'mean': array, 'cov': array} feature_dist : str Distribution type ('gaussian'). seed : int, optional Random seed for reproducibility.
Returns: ------- G : networkx.Graph Generated cSBM graph with node attributes 'block' and 'features'. labels : np.ndarray of shape (n,) Community label for each node. X : np.ndarray of shape (n, d) Feature matrix. """ if seed isnotNone: np.random.seed(seed) r = len(pi) assert p_matrix.shape == (r, r), "p_matrix must be r x r" assertlen(theta) == r, "theta must have one entry per community"
# Sample community labels labels = np.random.choice(r, size=n, p=pi) # Generate features # Assume Gaussian mixture d = theta[0]['mean'].shape[0] X = np.zeros((n, d)) for i inrange(n): z = labels[i] params = theta[z] X[i] = np.random.multivariate_normal(mean=params['mean'], cov=params['cov'])
# Generate adjacency A = np.zeros((n, n), dtype=int) for i inrange(n): for j inrange(i + 1, n): if np.random.rand() < p_matrix[labels[i], labels[j]]: A[i, j] = A[j, i] = 1
# Build graph G = nx.from_numpy_array(A) for idx inrange(n): G.nodes[idx]['block'] = int(labels[idx]) G.nodes[idx]['features'] = X[idx]
return G, labels, X
分析
形式确实很简单,有点类似回归中解释是高斯分布。
我们可以来看上篇的《Is homophily a necessity for graph neural networks? 》是怎么使用的。
我们考虑一个图 G,其中每个节点 i 具有特征 x∈ R 标签 y。
我们假设
(1) 节点 i 的特征从特征分布 F 中采样,即 x∼ F,μ(F)表示其均值;
(2) x 的维度之间相互独立;
(3) X 中的特征被一个正标量 B 所限制,即 max|X[i, j]| ≤ B;
(4) 对于节点 i,其邻居的标签从邻居分布 D 中独立采样。采样重复 deg(i)次以采样 deg(i)个邻居的标签。
定理1: 单个GCN操作激活前的输出的期望是:
$$
E[h_i]=W(E_{c\sim D_{y_i},x\sim F_c}[x])
$$
以及对于任何 t > 0,观测值与其期望的距离大于 t 的概率是有界的:
$$
P(||h_i-E[h_i]||_2\ge t)\le 2l\cdot exp(-\frac{deg(i)t^2}{2\rho^2(W)B^2l})
$$