# Source code for absorbing_centrality.graph_helpers

"""
    graph_helpers
    ~~~~~~~~~~

    Provides helper functions that are used for:
        * graph preprocessing
        * matrix generation (fundamental, transition)
        * Sherman-Morrison formula
        * absorbing random-walk centrality computation


    :copyright: 2015 Charalampos Mavroforakis, <cmav@bu.edu> and contributors
    :license: ISC

"""
from __future__ import division

from sys import maxint

from networkx import (DiGraph, adjacency_matrix, connected_component_subgraphs,
                      relabel_nodes)
from numpy import arange, argwhere, array, eye, matrix, zeros
from scipy.sparse import eye as speye, csc_matrix, diags
from scipy.sparse.linalg import inv as spinv, spilu

from .exceptions import CanonicalizationError

# Reserved label for the artificial "supernode" that `add_supernode` appends
# to a graph; random walks start from (or restart towards) this node.
SUPER_NODE = '_super_'


def keep_largest_component(G):
    """Reduces the graph to its largest connected component, with all
    self-loops removed.

    Parameters
    ----------
    G : NetworkX graph
        The input graph.

    Returns
    -------
    NetworkX graph
        The largest connected subgraph of G, stripped of self-loop edges.

    """
    largest = max(connected_component_subgraphs(G), key=len)
    largest.remove_edges_from(largest.selfloop_edges())
    return largest


def is_canonical(G):
    """Tests if the graph has been canonicalized.

    A graph counts as canonicalized once `canonical_relabel_nodes` has stored
    both of its name-translation dictionaries as graph attributes.

    Parameters
    ----------
    G : NetworkX graph

    Returns
    -------
    bool
        True if both ``G.graph['label_map']`` and ``G.graph['canonical_map']``
        are present.

    """
    metadata = G.graph
    return 'label_map' in metadata and 'canonical_map' in metadata
def canonical_relabel_nodes(G):
    """Relabels the nodes in the graph, such that the new names belong in the
    set [1, n].

    The labeling information is stored in the dictionaries
    `G.graph['canonical_map']` and `G.graph['label_map']`. These provide a way
    to map original to canonical node names and vice-versa, respectively.

    Parameters
    ----------
    G : NetworkX graph

    Returns
    -------
    G_prime : NetworkX graph
        The relabeled graph. It includes two attributes:
        (i) `G_prime.graph['canonical_map']` : dict
            Holds the mapping between the original names of the nodes and
            the new, canonical, names (original -> new).
        (ii) `G_prime.graph['label_map']` : dict
            Holds the mapping between the new, canonical, names and the
            original names of the nodes (new -> original).

    Note: The relabeling of a particular node might not be consistent across
    two consecutive runs. Also, the relabeling happens on a copy, so the
    original graph will be untouched.

    """
    n = G.number_of_nodes()
    mapping = dict(zip(G.nodes(), range(1, n + 1)))
    G_prime = relabel_nodes(G, mapping, copy=True)
    G_prime.graph['canonical_map'] = mapping.copy()
    # `.items()` (not the Python-2-only `.iteritems()`) so the module also
    # works on Python 3.
    G_prime.graph['label_map'] = dict((v, k) for k, v in mapping.items())
    return G_prime
def compute_fundamental_matrix(P, fast=True, drop_tol=1e-5, fill_factor=1000):
    """Computes the fundamental matrix for an absorbing random walk.

    Parameters
    ----------
    P : scipy.sparse matrix
        The transition probability matrix of the absorbing random walk. To
        construct this matrix, you start from the original transition matrix
        and delete the rows that correspond to the absorbing nodes.

    fast : bool, optional
        If True (default), use an incomplete-LU (SuperLU) factorization from
        scipy.sparse.linalg instead of an exact sparse inverse.

    drop_tol : float, optional
        If `fast` is True, the `drop_tol` parameter of the SuperLU solver is
        set to this value (default is 1e-5).

    fill_factor : int, optional
        If `fast` is True, the `fill_factor` parameter of the SuperLU solver
        is set to this value (default is 1000).

    Returns
    -------
    F : numpy.matrix
        The (dense) fundamental matrix of the random walk. Element (i, j)
        holds the expected number of times the random walk will be in state j
        before absorption, when it starts from state i. For more information,
        check [1]_.

    References
    ----------
    .. [1] Doyle, Peter G., and J. Laurie Snell.
       Random walks and electric networks.
       Carus mathematical monographs 22 (2000).
       https://math.dartmouth.edu/~doyle/docs/walks/walks.pdf

    """
    n = P.shape[0]
    # F = (I - P)^{-1}; factorize (I - P) rather than inverting it directly.
    F_inv = speye(n, format='csc') - P.tocsc()
    if fast:
        # Incomplete LU; solving against the identity yields an (approximate,
        # controlled by drop_tol) inverse much faster than spinv for large n.
        solver = spilu(F_inv, drop_tol=drop_tol, fill_factor=fill_factor)
        F = matrix(solver.solve(eye(n)))
    else:
        F = spinv(F_inv).todense()
    return F
def compute_personalized_transition_matrix(G, alpha=0.85, restart_set=None):
    """Returns the transition matrix of the random walk with restarts.

    Parameters
    ----------
    G : graph

    alpha : float, optional
        The probability of the random surfer to continue their walk (default
        is 0.85).

    restart_set : list, optional
        The set of nodes to restart from. If None (default), the restarts
        lead to the supernode.

    Returns
    -------
    P : scipy.sparse.matrix
        The probability matrix for the random walk with restarts.

    """
    # Avoid a mutable default argument; None stands in for [SUPER_NODE].
    if restart_set is None:
        restart_set = [SUPER_NODE]
    if not has_supernode(G) and SUPER_NODE in restart_set:
        raise CanonicalizationError('Cannot restart the random walks at the '
                                    'supernode')
    canonical_restart_set = [G.graph['canonical_map'][n] for n in restart_set]
    restart_graph = DiGraph()
    restart_edges = [(n, q) for n in G.nodes() for q in canonical_restart_set]
    # add a self loop edge at the supernode, if there is one, to avoid
    # division by zero when computing the transition matrix
    if has_supernode(G):
        # TODO Why is this edge added -- and then removed?
        restart_edges.append((G.graph['canonical_map'][SUPER_NODE],
                              G.graph['canonical_map'][SUPER_NODE]))
    restart_graph.add_edges_from(restart_edges)
    P = compute_transition_matrix(G)
    P_restart = compute_transition_matrix(restart_graph)
    if has_supernode(G):
        # remove the bottom right corner (the added self-loop)
        P_restart[-1, -1] = 0
    P_final = alpha * P + (1 - alpha) * P_restart
    # TODO do the transition probabilities from SUPER_NODE sum up to 1?
    # Does it matter?
    return P_final
def compute_transition_matrix(G):
    r"""Builds the transition matrix P of a uniform random walk on `G`.

    A step from node `i` lands on each of its neighbors `j` with equal
    probability:

    .. math::

        P_{i,j} = \frac{1}{\text{degree}(i)}

    Parameters
    ----------
    G : NetworkX graph

    Returns
    -------
    P : scipy.sparse matrix
        The random transition probability matrix.

    """
    # TODO: make this function work for weighted graphs
    A = adjacency_matrix(G)
    node_degrees = A.sum(axis=1)
    # Scale each row of A by 1/degree through a diagonal matrix product.
    D_inverse = diags((1 / node_degrees).T.tolist(), [0])
    return D_inverse * A
def _fast_update_fundamental_rows(P, F, row=0, row_previous=0): """Updates the rows of the fundamental matrix by adding an absorbing node, using the Woodbury identity. Parameters ---------- P : matrix The transition matrix of the graph. F : matrix The fundamental matrix of the graph after setting `row_previous` to be an absorbing node. row : int, optional The index of the row of `F` that will be set to be absorbing (default is 0). row_previous : int, optional The index of the row of `P` that was set to be absorbing when computing `F` (default is 0). Returns ------- F_updated_row : matrix The new fundamental matrix, where row of the absorbing node has been updated. """ n_P = P.shape[0] n_F = F.shape[0] # These are the rows of the matrix P that were used in order to compute F previous_non_absorbing = range(row_previous) + range(row_previous + 1, n_P) # Re-order the transition matrix P without row and column `row_previous`, # such that the row (and column) that is absorbing will be first. reordering = [row] + range(row) + range(row + 1, n_F) P_non_absorbing = P[previous_non_absorbing, :][:, previous_non_absorbing] F_reordered = F.take(reordering, axis=0).take(reordering, axis=1) U = csc_matrix((n_F, 1)) U[0, 0] = 1.0 V = P_non_absorbing[row, reordering] y = 1.0 + V.dot(F_reordered[:, 0])[0, 0] VF = V.dot(F_reordered) F_updated_row = F_reordered - F_reordered[:, 0].dot(VF) / y return F_updated_row def _fast_update_fundamental_columns(P, F, col=0, col_previous=0): """Updates the columns of the fundamental matrix by adding an absorbing node, using the Woodbury identity. Parameters ---------- P : matrix The transition matrix of the graph. F : matrix The fundamental matrix of the graph after setting `col_previous` to be an absorbing node. `F` needs to be the result of the _fast_update_fundamental_rows(), otherwise the order of the columns is wrong. col : int, optional The index of the column of `F` that corresponds to the new absorbing node (default is 0). 
col_previous : int, optional The index of the column of `P` that corresponds to the absorbing node when computing `F` (default is 0). Returns ------- F_updated_col : matrix The new fundamental matrix, in which the column of the absorbing node has been updated. """ n_P = P.shape[0] n_F = F.shape[0] # These are the rows of the matrix P that were used in order to compute FP previous_round_indices = range(col_previous) + range(col_previous + 1, n_P) # Re-order the transition matrix P without row and column `col_previous`, # such that the col (and row) that is absorbing will be first. current_indices = [col] + range(col) + range(col + 1, n_F) P_non_absorbing = P[previous_round_indices, :][:, previous_round_indices] # F is the result of the _fast_update_fundamental_rows(), so it is already # re-arranged in the way we want it. F_reordered = F U = P_non_absorbing[current_indices, col].A.T y = 1.0 + F_reordered[0, :].dot(U)[0, 0] FU = F_reordered.dot(U) F_updated_col = F_reordered - FU.dot(F_reordered[0, :]) / y return F_updated_col
def update_fundamental_matrix(P, F, next, previous, previous_index=0,
                              node_order=None):
    """Applies Woodbury's formula to update the fundamental matrix in order
    to avoid doing an inversion.

    Parameters
    ----------
    P : matrix
        The transition matrix of the graph, where `previous` is non
        absorbing.

    F : matrix
        The fundamental matrix of the graph after setting the node
        `previous` as an absorbing node.

    next : int
        The node that will be set as absorbing next. The result of this call
        will be a fundamental matrix where `next` is an absorbing node.

    previous : int
        The node that was set as absorbing when computing F.

    previous_index : int, optional
        The row/col index of node `previous` in P (default is 0).

    node_order : list, optional
        The nodes that correspond to the rows/cols of `P`, in order. If not
        supplied, the order is considered to be [0, .., n_P - 1], where n_P
        is the number of rows/cols in `P`.

    Returns
    -------
    P_updated : matrix
        The new transition matrix, where `previous` is absorbing.

    F_updated : matrix
        The fundamental matrix after adding `previous` and `next` to the set
        of absorbing nodes.

    node_order_updated : list
        The new order of the non-absorbing nodes in `F_updated`.

    next_index : int
        The row/col index of the node `next`, that we just set as absorbing,
        in `P_updated`.

    """
    n_P = P.shape[0]
    n_F = F.shape[0]
    if node_order is None:
        # Default ordering: node names equal their indices, so `previous`
        # can be used directly as an index.
        node_order = arange(n_P)
        node_order = node_order[list(range(previous)) +
                                list(range(previous + 1, n_P))]
        previous_index = previous
    if type(node_order) is list:
        node_order = array(node_order)
    # We need to find which row/column corresponds to the node `next`.
    # argwhere returns a 2-D array; extract a plain int so it can be used
    # with range() and as a scalar index.
    next_index = int(argwhere(array(node_order) == next)[0][0])
    node_order = node_order[list(range(next_index)) +
                            list(range(next_index + 1, n_F))]
    row_update = _fast_update_fundamental_rows(P, F, row=next_index,
                                               row_previous=previous_index)
    col_update = _fast_update_fundamental_columns(P, row_update,
                                                  col=next_index,
                                                  col_previous=previous_index)
    # Drop row/col `previous_index` from P, and the (first) absorbing
    # row/col from the updated fundamental matrix.
    keep = list(range(previous_index)) + list(range(previous_index + 1, n_P))
    P_updated = P[keep, :][:, keep]
    F_updated = col_update[1:, 1:]
    node_order_updated = node_order.tolist()
    return P_updated, F_updated, node_order_updated, next_index
def add_supernode(G, query=None):
    """Adds a supernode to the graph and connects it with directed edges to
    the query nodes.

    Parameters
    ----------
    G : NetworkX graph
        The graph in which we want to add a supernode.

    query : list, default is None
        The list of nodes that the supernode will be connected to. If
        `query` is None, the supernode will be connected to all the nodes in
        `G`. These new edges will be directed.

    Returns
    -------
    NetworkX graph
        A directed graph with the supernode, and the new edges, added. The
        attributes of the graph, i.e. 'label_map' and 'canonical_map', are
        also updated (or created if the input graph was not canonicalized)
        to reflect the new node.

    """
    # A canonical graph that already has a supernode is returned unchanged.
    if has_supernode(G):
        return G
    n = len(G)
    if query is None or not query:
        query = G.nodes()
    if not is_canonical(G):
        # Canonicalization renames nodes to 1..n, so the supplied query
        # names must be translated to their canonical counterparts.
        G = canonical_relabel_nodes(keep_largest_component(G))
        n = len(G)
        query = [G.graph['canonical_map'][q] for q in query]
    # NOTE(review): if G was already canonical, `query` is assumed to hold
    # canonical names here -- confirm against callers.
    if not G.is_directed():
        G = G.to_directed()
    # The supernode takes the next canonical name, n + 1.
    G.add_node(n + 1)
    if SUPER_NODE in G.graph['canonical_map']:
        # make sure that SUPER_NODE is the last node returned by nodes()
        nodes = G.nodes()
        if not G.graph['label_map'][nodes[-1]] == SUPER_NODE:
            raise CanonicalizationError('Supernode is not the last node.')
    # Register the supernode in both translation maps and wire it to the
    # query nodes with directed edges.
    G.graph['canonical_map'][SUPER_NODE] = n + 1
    G.graph['label_map'][n + 1] = SUPER_NODE
    G.add_edges_from([(n + 1, q) for q in query])
    return G
def has_supernode(G):
    """Checks if there exists a supernode in the graph.

    Parameters
    ----------
    G : NetworkX graph

    Returns
    -------
    has_supernode : bool
        True when the graph is canonicalized and its canonical map already
        contains the supernode label.

    """
    return is_canonical(G) and SUPER_NODE in G.graph['canonical_map']
def absorbing_centrality_inversion(G, team, query=None, with_restarts=False,
                                   alpha=0.85):
    """Compute the absorbing centrality of a team using a fast inversion
    with SuperLU solver.

    Parameters
    ----------
    G : NetworkX graph
        The graph on which to compute the centrality.

    team : list
        The team of nodes, whose centrality to compute.

    query : list, optional
        The set of query nodes to use for the random walks. If None
        (default) or empty, the query set is equal to the set of all nodes
        in the graph.

    with_restarts : bool, optional
        If True, restarts the random surfer to the query set (default is
        False).

    alpha : float, optional
        The probability of the random surfer to continue (default is 0.85).

    Returns
    -------
    score : float
        The absorbing centrality score.

    Note
    ----
    Both `team` and `query` should use the original node names.

    """
    if not is_canonical(G):
        G = canonical_relabel_nodes(keep_largest_component(G))
    query = [] if query is None else query
    canonical_query = [G.graph['canonical_map'][q] for q in query]
    G = add_supernode(G, canonical_query)
    if with_restarts:
        P = compute_personalized_transition_matrix(G, alpha)
    else:
        P = compute_transition_matrix(G)
    absorbing = [G.graph['canonical_map'][member] for member in team]
    # Canonical names are 1-based; shift to 0-based matrix indices.
    transient = [node - 1 for node in G if node not in absorbing]
    P_transient = P[transient, :][:, transient]
    F = compute_fundamental_matrix(P_transient)
    # The supernode occupies the last row of F; its row sum is the expected
    # number of steps before absorption.
    return F.sum(axis=1)[-1].sum()
def absorbing_centrality(G, team, query=None, P=None, epsilon=1e-5,
                         max_iterations=None, with_restarts=False,
                         alpha=0.85):
    r"""Compute the absorbing centrality of a team.

    The algorithm works by iteratively computing the powers of the
    non-absorbing submatrix of the transition matrix P.

    Parameters
    ----------
    G : NetworkX graph
        The graph on which to compute the centrality.

    team : list
        The team of nodes, whose centrality to compute.

    query : list, optional
        The set of query nodes to use for the random walks. If None
        (default) or empty, the query set is equal to the set of all nodes
        in the graph.

    P : matrix, optional
        The precomputed transition matrix of the graph (default is None).

    epsilon : float, optional
        The iterative algorithm stops when the error between the centrality
        computed by two successive iterations falls below epsilon (default
        is 1e-5).

    max_iterations : int, optional
        The upper limit to the number of iterations of the algorithm
        (default is None).

    with_restarts : bool, optional
        If True, restarts the random surfer to the query set (default is
        False).

    alpha : float, optional
        The probability of the random surfer to continue (default is 0.85).

    Returns
    -------
    score : float
        The absorbing centrality score.

    Note
    ----
    Both `team` and `query` should use the original node names.

    """
    if not is_canonical(G):
        G = canonical_relabel_nodes(keep_largest_component(G))
    if query is None:
        query = []
    canonical_query = [G.graph['canonical_map'][q] for q in query]
    G = add_supernode(G, canonical_query)
    if P is None:
        if with_restarts:
            P = compute_personalized_transition_matrix(G, alpha)
        else:
            P = compute_transition_matrix(G)
    team_ind = [G.graph['canonical_map'][c] for c in team]
    non_absorbing_nodes = [v - 1 for v in G if v not in team_ind]
    P_non_absorbing = P[non_absorbing_nodes, :][:, non_absorbing_nodes]
    # `float('inf')` replaces the Python-2-only `sys.maxint`; any value
    # larger than epsilon works as the initial sentinel.
    step = float('inf')
    # Start the walk at the supernode (the last row of the full matrix).
    s = zeros((G.number_of_nodes(), 1))
    s[-1] = 1
    X_current = s[non_absorbing_nodes]
    # Don't count the first jump, from the supernode to the queries
    score = 0
    i = 1
    while step > epsilon:
        X_current = P_non_absorbing.T.dot(X_current)
        # Disregard the steps from the supernode to the queries
        step = X_current.sum() - X_current[-1, 0]
        score += step
        i += 1
        if max_iterations is not None and i > max_iterations:
            break
    return score