import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
# %matplotlib inline
import networkx as nxNetwork Science - UDD
Network Projections
Cristian Candia-Castro Vallejos, Ph.D.\(^{1,2}\)
Teaching assitant: #### Yessica Herrera-Guzmán, Ph.D.\(^{2, 3}\)
- [1] Data Science Institute (IDS), Universidad del Desarrollo,Chile
- [2] Northwestern Institute on Complex Systems, Kellogg School of Management, Northwestern Unviersity, USA
- [3] Center for Complex Network Research (CCNR), Northeastern Unviersity, USA
Número de personas que contrajeron COVID
Este es un conjunto de datos generado aleatoriamente para ilustrar la construcción de redes bipartitas a partir de conjuntos de datos tabulares empíricos.
En este conjunto de datos, hay variantes de COVID y los nombres enumerados para cada variante son aquellos que fueron infectados con esa variante.
El objetivo de esta práctica es que te familiarices con este tipo de datos estructurados que no están en forma de lista de aristas, sino que se asemejan a fuentes de datos de investigaciones empíricas. A partir de estos datos, deberías ser capaz de crear una red bipartita, así como la lista de nodos y la lista de aristas para cada una de las proyecciones de la red.
data = pd.read_csv('./Data/COVIDvariants.csv')data.head()| Variant1 | Variant2 | Variant3 | Variant4 | Variant5 | Variant6 | Variant7 | Variant8 | Variant9 | Variant10 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Charlie | Frank | Quinn | Oliver | Charlie | Helen | Grace | Rachel | Michael | Grace |
| 1 | Sam | David | David | Nina | Nadia | Bob | Ivy | Laura | Tina | Ronald |
| 2 | Nina | Oliver | Charlie | Helen | Paul | Eva | Quinn | Sam | Ivy | Bob |
| 3 | Alice | Quinn | Bob | Quinn | Sam | Jack | Kevin | Michael | Frank | Nina |
| 4 | Kevin | Sam | Rachel | Jack | Kevin | Frank | Eva | Alice | Quinn | Laura |
Convert the dataset to the edgelist of a bipartite network, where one level (col1 = ‘variant’) are the variants and another level (col2 = ‘name’) are the infected individuals.
df = data.melt(var_name='variant', value_name='name').dropna()df| variant | name | |
|---|---|---|
| 0 | Variant1 | Charlie |
| 1 | Variant1 | Sam |
| 2 | Variant1 | Nina |
| 3 | Variant1 | Alice |
| 4 | Variant1 | Kevin |
| ... | ... | ... |
| 395 | Variant10 | Sarah |
| 396 | Variant10 | Paula |
| 397 | Variant10 | Ted |
| 398 | Variant10 | Arthur |
| 399 | Variant10 | Robert |
221 rows × 2 columns
g = nx.Graph()g = nx.from_pandas_edgelist(df, source='variant', target='name')# print(nx.info(g))
print(g)Graph with 69 nodes and 221 edges
g.degree('Ivy')4
g.degree('Variant1')41
variants = list(df.variant.unique())people = list(df.name.unique())layout = nx.spring_layout(g, k=0.05, iterations=50)plt.figure(figsize=(12, 12))
# Draw VARIANTS
nx.draw_networkx_nodes(g,
layout,
nodelist=variants,
node_size=80, # a LIST of sizes, based on g.degree
node_color='green',
edgecolors='green',
linewidths=1.5)
# Draw EVERYONE
nx.draw_networkx_nodes(g, layout,
nodelist=people,
node_color='navy',
edgecolors='navy',
alpha=0.9,
linewidths=0.7,
node_size=20)
# Draw REINFECTED PEOPLE
reinfected_people = [person for person in people if g.degree(person) > 1]
nx.draw_networkx_nodes(g, layout, nodelist=reinfected_people,
node_color='red',
edgecolors='navy',
linewidths=0.7,
node_size=40)
nx.draw_networkx_edges(g, layout, width=0.7, alpha=0.9,
edge_color="limegreen")
# label people
node_labels = dict(zip(people, people))
nx.draw_networkx_labels(g, layout, labels=node_labels,
font_size=8, font_weight='400')
# label variants
node_variants = dict(zip(variants, variants))
nx.draw_networkx_labels(g, layout, labels=node_variants,
font_size=12, font_weight='400')
# Turn off the axis because I know you don't want it
plt.axis('off')
plt.title("Network of COVID variants")
# plt.savefig('network_name.pdf', bbox_inches='tight', pad_inches=0)
# Tell matplotlib to show it
plt.show()
Project to unipartite: Infected people
# wide_df = dfa.pivot_table(index='variant', columns=df.groupby('variant').cumcount(), values='name', aggfunc='first')# wide_df| Name1 | Name2 | Name3 | Name4 | Name5 | Name6 | Name7 | Name8 | Name9 | Name10 | ... | Name32 | Name33 | Name34 | Name35 | Name36 | Name37 | Name38 | Name39 | Name40 | Name41 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| variant | |||||||||||||||||||||
| Variant1 | Charlie | Sam | Nina | Alice | Kevin | Helen | Paul | Grace | Frank | Rachel | ... | Melissa | Newt | Jules | Jackie | Leslie | Marcela | Maggie | Nadia | Lars | Mark |
| Variant10 | Grace | Ronald | Bob | Nina | Laura | Alice | Rachel | Ivy | Sam | Michael | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant2 | Frank | David | Oliver | Quinn | Sam | Jack | Paul | Charlie | Grace | Helen | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant3 | Quinn | David | Charlie | Bob | Rachel | Alice | Nadia | Sam | Tina | Jack | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant4 | Oliver | Nina | Helen | Quinn | Jack | Charlie | Paul | Laura | David | Alice | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant5 | Charlie | Nadia | Paul | Sam | Kevin | Rachel | Frank | David | Alice | Laura | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant6 | Helen | Bob | Eva | Jack | Frank | Martin | Julio | Sofia | Maggie | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant7 | Grace | Ivy | Quinn | Kevin | Eva | Sarah | Ronald | Alan | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant8 | Rachel | Laura | Sam | Michael | Alice | Nina | Ivy | Martin | Ronald | Rose | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant9 | Michael | Tina | Ivy | Frank | Quinn | Rachel | Leslie | Bob | Mike | Nina | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
10 rows × 41 columns
# wide_df.columns = [f'Name{i+1}' for i in range(wide_df.shape[1])]# wide_df| Name1 | Name2 | Name3 | Name4 | Name5 | Name6 | Name7 | Name8 | Name9 | Name10 | ... | Name32 | Name33 | Name34 | Name35 | Name36 | Name37 | Name38 | Name39 | Name40 | Name41 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| variant | |||||||||||||||||||||
| Variant1 | Charlie | Sam | Nina | Alice | Kevin | Helen | Paul | Grace | Frank | Rachel | ... | Melissa | Newt | Jules | Jackie | Leslie | Marcela | Maggie | Nadia | Lars | Mark |
| Variant10 | Grace | Ronald | Bob | Nina | Laura | Alice | Rachel | Ivy | Sam | Michael | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant2 | Frank | David | Oliver | Quinn | Sam | Jack | Paul | Charlie | Grace | Helen | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant3 | Quinn | David | Charlie | Bob | Rachel | Alice | Nadia | Sam | Tina | Jack | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant4 | Oliver | Nina | Helen | Quinn | Jack | Charlie | Paul | Laura | David | Alice | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant5 | Charlie | Nadia | Paul | Sam | Kevin | Rachel | Frank | David | Alice | Laura | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant6 | Helen | Bob | Eva | Jack | Frank | Martin | Julio | Sofia | Maggie | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant7 | Grace | Ivy | Quinn | Kevin | Eva | Sarah | Ronald | Alan | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant8 | Rachel | Laura | Sam | Michael | Alice | Nina | Ivy | Martin | Ronald | Rose | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Variant9 | Michael | Tina | Ivy | Frank | Quinn | Rachel | Leslie | Bob | Mike | Nina | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
10 rows × 41 columns
# # Alternativa 1: Funciona pero es menos eficiente
# # Initialize empty lists to store source and target names
# source = []
# target = []
# # Iterate through DataFrame rows
# for index, row in wide_df.iterrows():
# # Iterate through column names in the current row
# for i, name1 in enumerate(row):
# for j, name2 in enumerate(row[i+1:]):
# # Append source and target names to the lists
# source.append(name1)
# target.append(name2)
# # Create a new DataFrame from the source and target lists
# people_df = pd.DataFrame({'Source': source, 'Target': target})
# people_df = people_df.dropna()
# new_df = people_df.drop_duplicates()
# new_df = new_df[new_df['Source'] != new_df['Target']]
# h = nx.from_pandas_edgelist(new_df, source='Source', target='Target')# ALTERNATIVA 2 NetworkX (nativo de redes bipartitas)
import networkx as nx
# df: columnas ['name','variant']
B = nx.Graph()
B.add_nodes_from(df['name'].unique(), bipartite='people')
B.add_nodes_from(df['variant'].unique(), bipartite='variant')
B.add_edges_from(df[['name','variant']].itertuples(index=False, name=None))
people = [n for n, d in B.nodes(data=True) if d['bipartite']=='people']
# Proyección simple (arista si comparten ≥1 variante)
G = nx.algorithms.bipartite.projected_graph(B, people)
# Proyección ponderada (# de variantes compartidas)
h = nx.Graph()
h = nx.algorithms.bipartite.weighted_projected_graph(B, people)# ALTERNATIVA 3 Pandas + itertools (escala mejor)
from itertools import combinations
import pandas as pd
pairs = (df.dropna()
.groupby('variant')['name']
.apply(lambda s: list(combinations(sorted(s.unique()), 2)))
.explode().dropna())
edges = (pd.DataFrame(pairs.tolist(), columns=['Source','Target'])
.value_counts() # cuenta repeticiones
.reset_index(name='weight'))
# Si quieres un grafo:
import networkx as nx
h = nx.Graph()
h.add_weighted_edges_from(edges[['Source','Target','weight']].itertuples(index=False, name=None))# print(nx.info(h))
print(f"Número de nodos: {h.number_of_nodes()}")
print(f"Número de aristas: {h.number_of_edges()}")
print(f"Es dirigido?: {h.is_directed()}")Número de nodos: 59
Número de aristas: 1265
Es dirigido?: False
layout_h = nx.spring_layout(h, k=0.1, iterations=50)labels = {node: h.nodes[node].get('label', f'Node {node}') for node in h.nodes}plt.figure(figsize=(12, 12))
nx.draw_networkx_nodes(h,
layout_h,
node_size=[((h.degree(v)) ** 3)/200 for v in h.nodes()], # sizes based on h.degree
node_color='lavender',
edgecolors='rebeccapurple',
linewidths=1)
nx.draw_networkx_edges(h, layout_h, width=0.4, alpha=0.7, edge_color="gray")
node_labels = dict(zip(labels, labels))
nx.draw_networkx_labels(h, layout_h, labels=node_labels,
font_size=8)
plt.axis('off')
plt.title("Network of Infected People")
# plt.savefig('network_projection_name.pdf', bbox_inches='tight', pad_inches=0)
plt.show()
Tarea 1.2
Una semana de plazo.
Pregunta 1: ¿Cuáles son las redes proyectadas/unipartitas de cada conjunto de nodos? (es decir, redes separadas para cada conjunto)
Pregunta 2: ¿Qué información puedes obtener de cada proyección? y Explica la utilidad de las proyecciones de redes con respecto a la visualización y análisis de la red bipartita.
Ejercicio 1: Calcula el grado promedio de cada proyección de la red, luego calcula el grado promedio de los nodos persona y los nodos variante en la red bipartita. ¿Cuáles son las diferencias en la distribución? Por favor, explica por qué.
Ejercicio 2: Computa las propiedades de estas redes proyectadas (por ejemplo, tamaño, diámetro, distribución de grados).
Ejercicio 3: Construye la matriz de adyacencia de sus dos proyecciones, en los nodos morados y en los nodos verdes, respectivamente. Vea el ejemplo a continuación. Luego, explique la relevancia de cada proyección. Busquen en la documentación de networkx cómo construir la matriz de adyacencia. https://networkx.org/documentation/stable/reference/index.html