"""Functions for measuring the quality of a partition (into communities). """ from itertools import combinations import networkx as nx from networkx import NetworkXError from networkx.algorithms.community.community_utils import is_partition from networkx.utils import not_implemented_for from networkx.utils.decorators import argmap __all__ = ["coverage", "modularity", "performance", "partition_quality"] class NotAPartition(NetworkXError): """Raised if a given collection is not a partition.""" def __init__(self, G, collection): msg = f"{G} is not a valid partition of the graph {collection}" super().__init__(msg) def _require_partition(G, partition): """Decorator to check that a valid partition is input to a function Raises :exc:`networkx.NetworkXError` if the partition is not valid. This decorator should be used on functions whose first two arguments are a graph and a partition of the nodes of that graph (in that order):: >>> @require_partition ... def foo(G, partition): ... print("partition is valid!") ... >>> G = nx.complete_graph(5) >>> partition = [{0, 1}, {2, 3}, {4}] >>> foo(G, partition) partition is valid! >>> partition = [{0}, {2, 3}, {4}] >>> foo(G, partition) Traceback (most recent call last): ... networkx.exception.NetworkXError: `partition` is not a valid partition of the nodes of G >>> partition = [{0, 1}, {1, 2, 3}, {4}] >>> foo(G, partition) Traceback (most recent call last): ... networkx.exception.NetworkXError: `partition` is not a valid partition of the nodes of G """ if is_partition(G, partition): return G, partition raise nx.NetworkXError("`partition` is not a valid partition of the nodes of G") require_partition = argmap(_require_partition, (0, 1)) def intra_community_edges(G, partition): """Returns the number of intra-community edges for a partition of `G`. Parameters ---------- G : NetworkX graph. partition : iterable of sets of nodes This must be a partition of the nodes of `G`. The "intra-community edges" are those edges joining a pair of nodes in the same block of the partition. """ return sum(G.subgraph(block).size() for block in partition) def inter_community_edges(G, partition): """Returns the number of inter-community edges for a partition of `G`. according to the given partition of the nodes of `G`. Parameters ---------- G : NetworkX graph. partition : iterable of sets of nodes This must be a partition of the nodes of `G`. The *inter-community edges* are those edges joining a pair of nodes in different blocks of the partition. Implementation note: this function creates an intermediate graph that may require the same amount of memory as that of `G`. """ # Alternate implementation that does not require constructing a new # graph object (but does require constructing an affiliation # dictionary): # # aff = dict(chain.from_iterable(((v, block) for v in block) # for block in partition)) # return sum(1 for u, v in G.edges() if aff[u] != aff[v]) # MG = nx.MultiDiGraph if G.is_directed() else nx.MultiGraph return nx.quotient_graph(G, partition, create_using=MG).size() def inter_community_non_edges(G, partition): """Returns the number of inter-community non-edges according to the given partition of the nodes of `G`. Parameters ---------- G : NetworkX graph. partition : iterable of sets of nodes This must be a partition of the nodes of `G`. A *non-edge* is a pair of nodes (undirected if `G` is undirected) that are not adjacent in `G`. The *inter-community non-edges* are those non-edges on a pair of nodes in different blocks of the partition. Implementation note: this function creates two intermediate graphs, which may require up to twice the amount of memory as required to store `G`. """ # Alternate implementation that does not require constructing two # new graph objects (but does require constructing an affiliation # dictionary): # # aff = dict(chain.from_iterable(((v, block) for v in block) # for block in partition)) # return sum(1 for u, v in nx.non_edges(G) if aff[u] != aff[v]) # return inter_community_edges(nx.complement(G), partition) @not_implemented_for("multigraph") @require_partition def performance(G, partition): """Returns the performance of a partition. .. deprecated:: 2.6 Use `partition_quality` instead. The *performance* of a partition is the number of intra-community edges plus inter-community non-edges divided by the total number of potential edges. Parameters ---------- G : NetworkX graph A simple graph (directed or undirected). partition : sequence Partition of the nodes of `G`, represented as a sequence of sets of nodes. Each block of the partition represents a community. Returns ------- float The performance of the partition, as defined above. Raises ------ NetworkXError If `partition` is not a valid partition of the nodes of `G`. References ---------- .. [1] Santo Fortunato. "Community Detection in Graphs". *Physical Reports*, Volume 486, Issue 3--5 pp. 75--174 """ # Compute the number of intra-community edges and inter-community # edges. intra_edges = intra_community_edges(G, partition) inter_edges = inter_community_non_edges(G, partition) # Compute the number of edges in the complete graph (directed or # undirected, as it depends on `G`) on `n` nodes. # # (If `G` is an undirected graph, we divide by two since we have # double-counted each potential edge. We use integer division since # `total_pairs` is guaranteed to be even.) n = len(G) total_pairs = n * (n - 1) if not G.is_directed(): total_pairs //= 2 return (intra_edges + inter_edges) / total_pairs @require_partition def coverage(G, partition): """Returns the coverage of a partition. .. deprecated:: 2.6 Use `partition_quality` instead. The *coverage* of a partition is the ratio of the number of intra-community edges to the total number of edges in the graph. Parameters ---------- G : NetworkX graph partition : sequence Partition of the nodes of `G`, represented as a sequence of sets of nodes. Each block of the partition represents a community. Returns ------- float The coverage of the partition, as defined above. Raises ------ NetworkXError If `partition` is not a valid partition of the nodes of `G`. Notes ----- If `G` is a multigraph, the multiplicity of edges is counted. References ---------- .. [1] Santo Fortunato. "Community Detection in Graphs". *Physical Reports*, Volume 486, Issue 3--5 pp. 75--174 """ intra_edges = intra_community_edges(G, partition) total_edges = G.number_of_edges() return intra_edges / total_edges def modularity(G, communities, weight="weight", resolution=1): r"""Returns the modularity of the given partition of the graph. Modularity is defined in [1]_ as .. math:: Q = \frac{1}{2m} \sum_{ij} \left( A_{ij} - \gamma\frac{k_ik_j}{2m}\right) \delta(c_i,c_j) where $m$ is the number of edges, $A$ is the adjacency matrix of `G`, $k_i$ is the degree of $i$, $\gamma$ is the resolution parameter, and $\delta(c_i, c_j)$ is 1 if $i$ and $j$ are in the same community else 0. According to [2]_ (and verified by some algebra) this can be reduced to .. math:: Q = \sum_{c=1}^{n} \left[ \frac{L_c}{m} - \gamma\left( \frac{k_c}{2m} \right) ^2 \right] where the sum iterates over all communities $c$, $m$ is the number of edges, $L_c$ is the number of intra-community links for community $c$, $k_c$ is the sum of degrees of the nodes in community $c$, and $\gamma$ is the resolution parameter. The resolution parameter sets an arbitrary tradeoff between intra-group edges and inter-group edges. More complex grouping patterns can be discovered by analyzing the same network with multiple values of gamma and then combining the results [3]_. That said, it is very common to simply use gamma=1. More on the choice of gamma is in [4]_. The second formula is the one actually used in calculation of the modularity. For directed graphs the second formula replaces $k_c$ with $k^{in}_c k^{out}_c$. Parameters ---------- G : NetworkX Graph communities : list or iterable of set of nodes These node sets must represent a partition of G's nodes. weight : string or None, optional (default="weight") The edge attribute that holds the numerical value used as a weight. If None or an edge does not have that attribute, then that edge has weight 1. resolution : float (default=1) If resolution is less than 1, modularity favors larger communities. Greater than 1 favors smaller communities. Returns ------- Q : float The modularity of the paritition. Raises ------ NotAPartition If `communities` is not a partition of the nodes of `G`. Examples -------- >>> import networkx.algorithms.community as nx_comm >>> G = nx.barbell_graph(3, 0) >>> nx_comm.modularity(G, [{0, 1, 2}, {3, 4, 5}]) 0.35714285714285715 >>> nx_comm.modularity(G, nx_comm.label_propagation_communities(G)) 0.35714285714285715 References ---------- .. [1] M. E. J. Newman "Networks: An Introduction", page 224. Oxford University Press, 2011. .. [2] Clauset, Aaron, Mark EJ Newman, and Cristopher Moore. "Finding community structure in very large networks." Phys. Rev. E 70.6 (2004). .. [3] Reichardt and Bornholdt "Statistical Mechanics of Community Detection" Phys. Rev. E 74, 016110, 2006. https://doi.org/10.1103/PhysRevE.74.016110 .. [4] M. E. J. Newman, "Equivalence between modularity optimization and maximum likelihood methods for community detection" Phys. Rev. E 94, 052315, 2016. https://doi.org/10.1103/PhysRevE.94.052315 """ if not isinstance(communities, list): communities = list(communities) if not is_partition(G, communities): raise NotAPartition(G, communities) directed = G.is_directed() if directed: out_degree = dict(G.out_degree(weight=weight)) in_degree = dict(G.in_degree(weight=weight)) m = sum(out_degree.values()) norm = 1 / m**2 else: out_degree = in_degree = dict(G.degree(weight=weight)) deg_sum = sum(out_degree.values()) m = deg_sum / 2 norm = 1 / deg_sum**2 def community_contribution(community): comm = set(community) L_c = sum(wt for u, v, wt in G.edges(comm, data=weight, default=1) if v in comm) out_degree_sum = sum(out_degree[u] for u in comm) in_degree_sum = sum(in_degree[u] for u in comm) if directed else out_degree_sum return L_c / m - resolution * out_degree_sum * in_degree_sum * norm return sum(map(community_contribution, communities)) @require_partition def partition_quality(G, partition): """Returns the coverage and performance of a partition of G. The *coverage* of a partition is the ratio of the number of intra-community edges to the total number of edges in the graph. The *performance* of a partition is the number of intra-community edges plus inter-community non-edges divided by the total number of potential edges. This algorithm has complexity $O(C^2 + L)$ where C is the number of communities and L is the number of links. Parameters ---------- G : NetworkX graph partition : sequence Partition of the nodes of `G`, represented as a sequence of sets of nodes (blocks). Each block of the partition represents a community. Returns ------- (float, float) The (coverage, performance) tuple of the partition, as defined above. Raises ------ NetworkXError If `partition` is not a valid partition of the nodes of `G`. Notes ----- If `G` is a multigraph; - for coverage, the multiplicity of edges is counted - for performance, the result is -1 (total number of possible edges is not defined) References ---------- .. [1] Santo Fortunato. "Community Detection in Graphs". *Physical Reports*, Volume 486, Issue 3--5 pp. 75--174 """ node_community = {} for i, community in enumerate(partition): for node in community: node_community[node] = i # `performance` is not defined for multigraphs if not G.is_multigraph(): # Iterate over the communities, quadratic, to calculate `possible_inter_community_edges` possible_inter_community_edges = sum( len(p1) * len(p2) for p1, p2 in combinations(partition, 2) ) if G.is_directed(): possible_inter_community_edges *= 2 else: possible_inter_community_edges = 0 # Compute the number of edges in the complete graph -- `n` nodes, # directed or undirected, depending on `G` n = len(G) total_pairs = n * (n - 1) if not G.is_directed(): total_pairs //= 2 intra_community_edges = 0 inter_community_non_edges = possible_inter_community_edges # Iterate over the links to count `intra_community_edges` and `inter_community_non_edges` for e in G.edges(): if node_community[e[0]] == node_community[e[1]]: intra_community_edges += 1 else: inter_community_non_edges -= 1 coverage = intra_community_edges / len(G.edges) if G.is_multigraph(): performance = -1.0 else: performance = (intra_community_edges + inter_community_non_edges) / total_pairs return coverage, performance