import numpy as np

# Wipe all outputs from this notebook
from IPython.display import Image, clear_output, display
clear_output(True)

# Import local plotting functions and in-notebook display functions
import matplotlib.pyplot as plt
%matplotlib inline

import networkx as nx
## Load the full coauthorship network
fpath = "../resources/ca-AstroPh.txt.gz"

# fpath = "../resources/ca-CondMat.txt.gz"
g = nx.read_edgelist(fpath)

## Create a subgraph of the 1000 most connected authors
subgraph = sorted(g.degree, key=lambda x: x[1], reverse=True)[:4000]
subgraph = [x[0] for x in subgraph]
g2 = g.subgraph(subgraph)
# rename nodes to sequential integers as they would appear in an adjacency matrix
g2 = nx.convert_node_labels_to_integers(g2, first_label=0)

pos = nx.spring_layout(g2)
# pos = nx.kamada_kawai_layout(g2)
# nx.draw_spring(g2, pos=pos, node_size=10, node_color='black', edge_color='gray', width=0.5)
nx.draw(g2, pos=pos, node_size=5, node_color='black', edge_color='gray', width=0.5, alpha=0.5)
plt.show()

A = nx.adjacency_matrix(g2).todense()
print(A.shape)
# find sparsity

n = A.shape[0]
density = np.sum(A != 0) / (n * (n-1))
print("Density: {:.2f}%".format(density * 100))

(4000, 4000)
Density: 1.26%

plt.figure(figsize=(10, 10))
# plt.spy(A[:500, :500], color='k') # a spy plot is a plot of the sparsity pattern of a matrix
plt.spy(A, markersize=0.05, color='k') # a spy plot is a plot of the sparsity pattern of a matrix
plt.xlabel("Author")    
plt.ylabel("Author")

Text(0, 0.5, 'Author')

# degree distribution
degrees = np.sum(A, axis=0)
plt.figure(figsize=(10, 5))
plt.hist(degrees, bins=100);
plt.xlabel("Degree (number of coauthors))")
plt.ylabel("Number of Authors")

Text(0, 0.5, 'Number of Authors')

class GraphRandomWalk:
    """A class for performing random walks on a graph

    Parameters:
        A (np.ndarray): The adjacency matrix of the graph
        random_state (int): The random seed to use
        store_history (bool): Whether to store the history of the random walk
    """

    def __init__(self, A, random_state=None, store_history=False):
        self.A = A
        self.n = A.shape[0]
        self.degrees = np.sum(A, axis=0)
        self.D = np.diag(self.degrees)

        self.random_state = random_state
        self.store_history = store_history

        np.random.seed(self.random_state)

        if self.store_history:
            self.history = []

    def step(self, curr):
        """
        Take a single step from a given node to any of its neighbors with equal 
        probability

        Args:
            curr (int): The current node

        Returns:
            nxt (int): The next
        """
        choices = A[curr, :].nonzero()[0]
        nxt = np.random.choice(choices)
        return nxt


    def random_walk(self, start, steps):
        """Perform a random walk on the graph

        Args:
            start (int): The starting node
            steps (int): The number of steps to take

        Returns:
            stop (int): The final node
        """
        curr = start
        if self.store_history:
            self.history.append(start)
        for _ in range(steps):
            curr = self.step(curr)
            if self.store_history:
                self.history.append(curr)

        return curr

model = GraphRandomWalk(A, random_state=0, store_history=True)

# simulate a random walk starting from node 0 for 130 timesteps
model.random_walk(0, 130)

plt.plot(model.history)
plt.xlabel("Step")
plt.ylabel("Node")

Text(0, 0.5, 'Node')

plt.figure(figsize=(8, 8))
nx.draw(g2, pos=pos, node_size=5, node_color='black', edge_color='gray', width=0.5, alpha=0.5)
traj = [pos[item] for item in model.history]
plt.plot(*zip(*traj), color='red', linewidth=2)
plt.show()

all_traj = []
for _ in range(1000):
    model = GraphRandomWalk(A,  store_history=True)
    model.random_walk(0, 130) # simulate a random walk starting from node 0 for 130 timesteps
    all_traj.append(np.array(model.history).copy())

plt.figure(figsize=(8, 8))
for traj in all_traj:
    traj = [pos[item] for item in traj]
    plt.plot(*zip(*traj), color='k', linewidth=.2, alpha=0.01)
plt.axis('off');

# The degree of each authors
degrees = np.sum(A, axis=0)

# The number of times each author was visited
all_visits = np.hstack(all_traj)
vals, bins = np.histogram(all_visits, bins=np.arange(0, len(degrees) + 1))


plt.figure(figsize=(8, 8))
plt.plot(degrees, vals, '.k', markersize=20, alpha=0.1)
plt.xlim(0, np.percentile(degrees, 95))
plt.ylim(0, np.percentile(vals, 95))

plt.xlabel("Degree")
plt.ylabel("Number of visits")

Text(0, 0.5, 'Number of visits')

class FirstPassageTime(GraphRandomWalk):
    """A class for computing the first passage time distribution

    Parameters:
        A (np.ndarray): The adjacency matrix of the graph
        random_state (int): The random seed to use
        max_iter (int): The maximum number of iterations to use
        store_history (bool): Whether to store the history of the random walk
    """

    def __init__(self, A, max_iter=10000, random_state=None, store_history=False):
        self.A = A
        self.max_iter = max_iter
        self.random_state = random_state
        self.store_history = store_history
        np.random.seed(self.random_state)

        if self.store_history:
            self.history = []


    def fpt(self, start, stop):
        """Compute a single first passage time from a starting node to a stopping node

        Args:
            start (int): The starting node
            stop (int): The stopping node
            steps (int): The maximum number of steps to take

        Returns:
            fpt (int): The first passage time
        """
        curr = start
        if self.store_history:
            self.history.append(start)

        for i in range(self.max_iter):
            curr = self.step(curr)
            if self.store_history:
                self.history.append(curr)
            if curr == stop:
                return i

        return np.inf


fpt = FirstPassageTime(A, random_state=0, store_history=True, max_iter=100000)
fpt.fpt(0, 10)

4948

all_paths_close, all_fpts_close = [], []
all_paths_far, all_fpts_far = [], []

for _ in range(100):
    fpt = FirstPassageTime(A, store_history=True, max_iter=100000)
    all_fpts_close.append(fpt.fpt(0, 3))
    all_paths_close.append(fpt.history)

    fpt = FirstPassageTime(A, store_history=True, max_iter=100000)
    all_fpts_far.append(fpt.fpt(0, 100))
    all_paths_far.append(fpt.history)

# print number that didn't converge within max_iter
print("Number of paths that didn't converge within max_iter: {}".format(np.sum(np.isinf(all_fpts_close))))
print("Number of paths that didn't converge within max_iter: {}".format(np.sum(np.isinf(all_fpts_far))))

Number of paths that didn't converge within max_iter: 0
Number of paths that didn't converge within max_iter: 0

plt.semilogy()
plt.hist(all_fpts_far, bins=100, alpha=0.5, label="Further authors", zorder=5);
plt.hist(all_fpts_close, bins=100, alpha=0.5, label="Close authors", color='r');
plt.xlabel("First Passage Time")
plt.ylabel("Number of Paths")
plt.xlim([0, np.max(all_fpts_far)])
plt.legend()

print("Mean FPT between distant authors: {:.2f}".format(np.mean(all_fpts_far)))
print("Mean FPT between close authors: {:.2f}".format(np.mean(all_fpts_close)))

Mean FPT between distant authors: 8825.31
Mean FPT between close authors: 4038.19

plt.figure(figsize=(8, 8))
for traj1, traj2 in zip(all_paths_far[:10], all_paths_close[:100]):
    traj1 = [pos[item] for item in traj1]
    traj2 = [pos[item] for item in traj2]
    plt.plot(*zip(*traj1), color='b', linewidth=.3, alpha=0.1)
    plt.plot(*zip(*traj2), color='r', linewidth=.3, alpha=0.1)
plt.axis('off')

(-0.7772509068250656,
 0.5980996698141098,
 -0.9608237326145173,
 0.6776723444461823)

# make the transition matrix from the adjacency matrix

# normalize the adjacency matrix
D = np.diag(np.sum(A, axis=0))
T = np.linalg.inv(D) @ A

# check that the rows sum to 1
np.sum(T, axis=1)

# compute the first passage time distribution
fpt = np.linalg.inv(np.identity(T.shape[0]) - T)

print("Mean FPT for distant authors: {:.2f}".format(fpt[0, 1]))
print("Mean FPT for closer authors: {:.2f}".format(fpt[0, 15]))

Mean FPT for distant authors: 3488637199167.52
Mean FPT for closer authors: 5465531612029.12

# PageRank centrality

# normalize the adjacency matrix
D = np.diag(np.sum(A, axis=0))
T = np.linalg.inv(D) @ A

d = 0.85 # damping factor (probability of following a link)
page_rank = np.linalg.inv(np.identity(T.shape[0]) - d * A @ np.linalg.inv(D)) @ np.ones((T.shape[0], 1))

# plot matrix with networkx with node size proportional to and colored by page rank centrality
plt.figure(figsize=(10, 10))
nx.draw(g2, pos=pos, node_size=page_rank*4, node_color=page_rank, edge_color='gray', width=0.01, cmap='viridis')
plt.show()

# compare pagerank and degree centrality
plt.figure(figsize=(6, 6))
plt.plot(degrees, page_rank, '.', alpha=0.5)
plt.xlabel("Degree")
plt.ylabel("PageRank")

Text(0, 0.5, 'PageRank')

Complex networks and graph theory¶

A large matrix dataset: coauthorship among physicists¶

We can think of graphs as large, (often) sparse matrices¶

A random walk on the graph of `astro-ph` coauthorship¶

Does the degree alone determine the probability of visiting a node?¶

First passage times on the network of collaborators¶

Can we compute this analytically?¶

Complex networks and graph theory¶

A large matrix dataset: coauthorship among physicists¶

We can think of graphs as large, (often) sparse matrices¶

A random walk on the graph of astro-ph coauthorship¶

Does the degree alone determine the probability of visiting a node?¶

First passage times on the network of collaborators¶

Can we compute this analytically?¶

A random walk on the graph of `astro-ph` coauthorship¶