Data is sourced from the Prague Public Transport Open Data portal, specifically the GTFS (General Transit Feed Specification) timetables.

import collections
import math
import warnings

import contextily as ctx
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.font_manager as fm
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pywaffle as waff

from mpl_toolkits.axes_grid1.anchored_artists import AnchoredSizeBar
from matplotlib.lines import Line2D
warnings.simplefilter(action="ignore", category=UserWarning)
warnings.simplefilter(action="ignore", category=DeprecationWarning)
np.random.seed(1)
plt.style.use("ggplot")
font = fm.FontProperties(size=9)
red = (226 / 255, 74 / 255, 51 / 255)
redl = (226 / 255, 74 / 255, 51 / 255, 0.6)
redf = (226 / 255, 74 / 255, 51 / 255, 1)
blue = (52 / 255, 138 / 255, 189 / 255)
grey = (100 / 255, 100 / 255, 100 / 255)

📚 Dataset

The datasets are loaded into memory, and basic information for preprocessing is displayed.

conns = pd.read_csv("d.csv")
stops = pd.read_csv("stops.txt")
conns.head()

stop_fromstop_from_namestop_tostop_to_namedepart_fromarrive_toroute_typeis_nightmondaytuesdaywednesdaythursdayfridaysaturdaysunday
0U2991Z301HněviceT58005Hněvice seř.n.4:53:004:54:30201111100
1T58005Hněvice seř.n.U4610Z301Záluží4:54:304:56:00201111100
2U4610Z301ZálužíU4609Z301Dobříň4:56:004:59:00201111100
3U4609Z301DobříňU4608Z301Roudnice nad Labem4:59:005:03:00201111100
4U4608Z301Roudnice nad LabemU4609Z301Dobříň4:36:004:38:00201111100
conns.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1642433 entries, 0 to 1642432
Data columns (total 15 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   stop_from       1642433 non-null  object
 1   stop_from_name  1642433 non-null  object
 2   stop_to         1642433 non-null  object
 3   stop_to_name    1642433 non-null  object
 4   depart_from     1642433 non-null  object
 5   arrive_to       1642433 non-null  object
 6   route_type      1642433 non-null  int64 
 7   is_night        1642433 non-null  int64 
 8   monday          1642433 non-null  int64 
 9   tuesday         1642433 non-null  int64 
 10  wednesday       1642433 non-null  int64 
 11  thursday        1642433 non-null  int64 
 12  friday          1642433 non-null  int64 
 13  saturday        1642433 non-null  int64 
 14  sunday          1642433 non-null  int64 
dtypes: int64(9), object(6)
memory usage: 188.0+ MB
stops.head()

stop_idstop_namestop_latstop_lonzone_idstop_urllocation_typeparent_stationwheelchair_boardinglevel_idplatform_codeasw_node_idasw_stop_id
0U50S1Budějovická50.04441114.448787PNaN1NaN1NaNNaN50.0NaN
1U52S1Chodov50.03167214.490961PNaN1NaN1NaNNaN52.0NaN
2U75S1Kolbenova50.11039514.516398PNaN1NaN1NaNNaN75.0NaN
3U78S1Ládví50.12659114.469451PNaN1NaN1NaNNaN78.0NaN
4U100S1Vltavská50.10029814.438492PNaN1NaN1NaNNaN100.0NaN
stops.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16435 entries, 0 to 16434
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   stop_id              16435 non-null  object 
 1   stop_name            15936 non-null  object 
 2   stop_lat             16435 non-null  float64
 3   stop_lon             16435 non-null  float64
 4   zone_id              15408 non-null  object 
 5   stop_url             0 non-null      float64
 6   location_type        16435 non-null  int64  
 7   parent_station       954 non-null    object 
 8   wheelchair_boarding  16435 non-null  int64  
 9   level_id             954 non-null    object 
 10  platform_code        14750 non-null  object 
 11  asw_node_id          15748 non-null  float64
 12  asw_stop_id          15354 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 1.6+ MB

⚙️ Data Preprocessing

This section details the transformation of key features into the correct formats and discards unused values.

The output of this section is two clean datasets: conns (connections) and stops (stops).

🚈 Connections - conns

⏳ Time Data

Time columns are converted to the correct format and validated. Each time value is represented by three columns: H, M, S.

def split_time_column(df, col):
    colH, colM, colS = f"{col}_H", f"{col}_M", f"{col}_S"
    cols = [colH, colM, colS]
    slice = df.loc[:, col].str.split(":", expand=True).astype("int64")
    slice[cols] = slice
    slice[colH] = slice[colH].mod(24)
    return slice[cols], cols
for col in ["depart_from", "arrive_to"]:
    if col in conns.columns:
        slice, cols = split_time_column(conns, col)
        conns.drop(col, axis=1, inplace=True)
        conns[cols] = slice
def to_timedelta(df, h_col, m_col, s_col):
    seconds = 3600 * df[h_col] + 60 * df[m_col] + df[s_col]
    return seconds


departures = to_timedelta(conns, "depart_from_H", "depart_from_M", "depart_from_S")
arrivals = to_timedelta(conns, "arrive_to_H", "arrive_to_M", "arrive_to_S")
fig, ax = plt.subplots(1, 1, figsize=(9, 4), layout="constrained")
ax.set_title("Time of departure")
ax.hist(departures / 3600, bins=24, range=(0, 24), alpha=0.6)
ax.hist(departures / 3600, bins=24, range=(0, 24), histtype="step", color=red, linewidth=1.5)
ax.set_xlabel("Time [h]")
ax.set_ylabel("Count")
ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0), useMathText=True)
ax.set_xticks(range(0, 25, 3));

png

📅 Days of the Week and Service Type

Relevant values are converted to a categorical type.

days = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
for col in ["is_night", *days]:
    conns[col] = conns[col].astype("boolean")
days_dist = pd.DataFrame(conns[days].sum(), columns=["count"])
days_dist.style.format(precision=3, thousands=" ", decimal=".").format_index(str.upper, axis=0).format_index(str.upper, axis=1)
 COUNT
MONDAY1 159 992
TUESDAY1 159 986
WEDNESDAY1 159 986
THURSDAY1 159 986
FRIDAY1 160 097
SATURDAY492 743
SUNDAY504 771
fig, ax = plt.subplots(1, 1, figsize=(9, 4), layout="constrained")
ax.set_title("Number of departures")
ax.barh(days_dist["count"].index, days_dist["count"], alpha=0.85)
ax.set_ylabel("Day")
ax.set_xlabel("Count")
ax.ticklabel_format(axis="x", style="sci", scilimits=(0, 0), useMathText=True)
ax.invert_yaxis()
ax.grid(visible=False, which="major", axis="x")

png

🪪 Stop IDs

Unused stop IDs are dropped. Stops are identified by name.

conns.drop(["stop_from", "stop_to"], axis=1, inplace=True, errors="ignore")

🚢 Transport Modes

Transport modes are converted to a categorical data type.

route_type_map = {
    0: "Tram",
    1: "Metro",
    2: "Rail",
    3: "Bus",
    4: "Ferry",
    7: "Funicular",
}
conns["route_type"] = (conns["route_type"].map(route_type_map, na_action="ignore").astype("category"))
types = conns["route_type"].value_counts()
fig, axes = plt.subplots(1, 2, figsize=(12, 4), layout="constrained", gridspec_kw={"wspace": 0.1})
ax = axes[0]
ax.set_title("Route types")
ax.set_xlabel("Frequency")
ax.ticklabel_format(axis="x", style="sci", scilimits=(0, 0), useMathText=True)
bars = ax.barh(types.index, types, alpha=0.85)
ax.invert_yaxis()
ax.set_xlim(0, 1.2 * 10**6)
for bar, count in zip(bars, types):
    ax.text(bar.get_width() + 10**4, bar.get_y() + bar.get_height() / 2, f"{count}", va="center", fontsize=10)

val = types / 10000
val_freq = val / val.sum()
waff.Waffle.make_waffle(
    ax=axes[1], rows=12, values=val,
    title={"label": "Route types", "loc": "center"},
    labels=[f"{k} ({v*100:.2f}%)" for k, v in val_freq.items()],
    legend={"bbox_to_anchor": (1.5, 1), "ncol": 1, "framealpha": 0},
    icons=["bus", "train-tram", "train-subway", "train", "ship", "car"],
    font_size=16, icon_style="solid", icon_legend=True, starting_location="NW",
    vertical=True, cmap_name="Set2",
)

png

🚏 Stops - stops

🧭 Coordinates

Only coordinates are used from the stops dataset for plotting connections. Other features are discarded.

For stops with identical names, their average coordinates are used. Minor name variations for the same physical stops are not addressed here but will be manually adjusted later if needed.

stops = stops[["stop_name", "stop_lat", "stop_lon"]]
stops = stops.groupby(by=["stop_name"], dropna=True).mean()

🏭 Preprocessing Summary

After preprocessing, the two clean datasets, conns and stops, appear as follows (time columns are omitted for brevity):

conns.drop(['depart_from_H', 'depart_from_M', 'depart_from_S', 'arrive_to_H', 'arrive_to_M', 'arrive_to_S'], axis=1).head()

stop_from_namestop_to_nameroute_typeis_nightmondaytuesdaywednesdaythursdayfridaysaturdaysunday
0HněviceHněvice seř.n.RailFalseTrueTrueTrueTrueTrueFalseFalse
1Hněvice seř.n.ZálužíRailFalseTrueTrueTrueTrueTrueFalseFalse
2ZálužíDobříňRailFalseTrueTrueTrueTrueTrueFalseFalse
3DobříňRoudnice nad LabemRailFalseTrueTrueTrueTrueTrueFalseFalse
4Roudnice nad LabemDobříňRailFalseTrueTrueTrueTrueTrueFalseFalse
stops.head()

stop_latstop_lon
stop_name
AHr Km 11,48550.14655014.731470
Albertov50.06791714.420798
Ametystová49.98820114.362216
Amforová50.04177814.327298
Anděl50.07113214.403406

🕸️ Network Analysis

This section covers:

  • Constructing the transport network.
  • Visualizing key subgraphs.
  • Performing basic node analysis.

🛠️ Graph Construction

stop_pos = {n: np.array([c["stop_lon"], c["stop_lat"]]) for n, c in stops.iterrows()}
nodes = [(n, s.to_dict()) for n, s in stops.iterrows()]
# G.add_nodes_from(nodes)
def extract_edges(df):
    cc = df[["stop_from_name", "stop_to_name"]].copy()
    cc[["stop_from_name", "stop_to_name"]] = np.sort(cc[["stop_from_name", "stop_to_name"]], axis=1)
    cc["count"] = 1
    cc = cc.groupby(["stop_from_name", "stop_to_name"]).sum().reset_index()
    cc = [(s["stop_from_name"], s["stop_to_name"], s["count"]) for _, s in cc.iterrows()]
    return cc
def build_graph(conns):
    edges = extract_edges(conns)
    g = nx.Graph()
    g.add_weighted_edges_from(edges)
    return g
G = build_graph(conns)
G.remove_node("Praha Masarykovo nádr.")
G.remove_node("Pha hl.n. Lc201,202")

🎨 Graph and Subgraph Visualization

  1. The overall coverage of Prague (and surrounding areas).
  2. Prague city center.
  3. Dejvice.
  4. Charles Square.
  5. Main Railway Station.
  6. Lesser Town.
def subgroup_map_filter(g, m, lat="stop_lat", lon="stop_lon"):
    def filter_subgroup(lbc, urc):
        filtered = m[(lbc[0] < m[lat]) & (m[lat] < urc[0]) & (lbc[1] < m[lon]) & (m[lon] < urc[1])]
        return g.subgraph(filtered.index)

    return filter_subgroup

def subgroup_distance_filter(g, m, lat="stop_lat", lon="stop_lon"):
    def filter_subgroup(coor, dist):
        x, y = coor
        dx = m[lat] - x
        dy = m[lon] - y
        dists = np.sqrt(dx * dx + dy * dy)
        filtered = m[dists < dist]
        return g.subgraph(filtered.index)
    return filter_subgroup

def subgroup_distance_filter_inv(g, m, lat="stop_lat", lon="stop_lon"):
    def filter_subgroup(coor, dist):
        x, y = coor
        dx = m[lat] - x
        dy = m[lon] - y
        dists = np.sqrt(dx * dx + dy * dy)
        filtered = m[dists > dist]
        return g.subgraph(filtered.index)
    return filter_subgroup

def add_scalebar(ax, s, loc="lower right", label_top=False):
    txt = ""
    if s > 0.008:
        txt = f"{round(s * 10**2)} km"
    else:
        txt = f"{round(s * 10**5)} m"
    scalebar = AnchoredSizeBar(ax.transData, s, txt, loc, label_top=label_top, fontproperties=font, pad=0.4)
    ax.add_artist(scalebar)
filter = subgroup_map_filter(G, stops)
filter_d = subgroup_distance_filter(G, stops)
filter_d_inv = subgroup_distance_filter_inv(G, stops)
def add_map(ax, source, scale, reset_extent=True):
    ctx.add_basemap(ax=ax, crs="WGS84", source=ctx.providers.CartoDB.Positron, reset_extent=True, attribution=False)
    ax.grid(False)
    add_scalebar(ax, scale)

def draw_part(ax, title, title_loc, A, B, radius, node_param, edge_param):
    ax.set_title(title, loc=title_loc)
    ax.set_xlim(A[1], B[1])
    ax.set_ylim(A[0], B[0])
    M = (np.array(A) + np.array(B)) / 2
    g = filter_d(M, radius)
    nx.draw_networkx_nodes(g, pos=stop_pos, ax=ax, node_size=node_param["size"], node_color=node_param["color"], alpha=node_param["alpha"])
    nx.draw_networkx_edges(g, pos=stop_pos, ax=ax, width=edge_param["size"], edge_color=edge_param["color"], alpha=edge_param["alpha"])

def draw_part_with_label(ax, title, title_loc, A, B, radius, params):
    ax.set_title(title, loc=title_loc)
    ax.set_xlim(A[1], B[1])
    ax.set_ylim(A[0], B[0])
    M = (np.array(A) + np.array(B)) / 2
    g = filter_d(M, radius)
    nx.draw_networkx(g, pos=stop_pos, ax=ax, node_size=params["node_size"], node_color=params["node_color"], width=params["width"],
                     font_size=params["font_size"], edge_color=params["edge_color"], with_labels=True)
centre = ["Centre", [50.07347057153624, 14.396852618086926], [50.11329080577483, 14.44677177212586], 0.04]
dejvice = ["Dejvice", [50.09122027369268, 14.385020433301096], [50.10756165438437, 14.403450996019187], 0.01]
charles = ["Charles Square", [50.069188151071394, 14.40950684182552], [50.08070937389346, 14.425281091947769], 0.02]
station = ["Main Railway Station", [50.07344788999524, 14.4278631463291], [50.088977084459446, 14.44535284907064], 0.02]
lesser = ["Lesser Town Square", [50.075600319200056, 14.392449133622146], [50.09242635204253, 14.414991205632134], 0.03]
nparam1 = {"size": 30, "color": red, "alpha": 0.8}
eparam1 = {"size": 1.5, "color": red, "alpha": 0.6}
params1 = {"node_size": 60, "node_color": redl, "width": 1.5, "font_size": 10, "edge_color": redl}
fig, axes = plt.subplots(3, 2, figsize=(12, 15), layout="constrained")
fig.suptitle("Coverage maps", fontsize=20)

ax = axes[0][0]
ax.set_title("All stops", loc="left")
nx.draw_networkx_nodes(G, pos=stop_pos, ax=ax, node_size=10, node_color=red, alpha=0.3)
add_map(ax, ctx.providers.CartoDB.Positron, 0.5)

ax = axes[0][1]
loc = centre
draw_part(ax, loc[0], "right", loc[1], loc[2], loc[3], nparam1, eparam1)
add_map(ax, ctx.providers.CartoDB.Positron, 0.01)

ax = axes[1][0]
loc = dejvice
draw_part_with_label(ax, loc[0], "left", loc[1], loc[2], loc[3], params1)
add_map(ax, ctx.providers.CartoDB.Positron, 0.002)

ax = axes[1][1]
loc = charles
draw_part_with_label(ax, loc[0], "right", loc[1], loc[2], loc[3], params1)
add_map(ax, ctx.providers.CartoDB.Positron, 0.002)

ax = axes[2][0]
loc = station
draw_part_with_label(ax, loc[0], "left", loc[1], loc[2], loc[3], params1)
add_map(ax, ctx.providers.CartoDB.Positron, 0.0025)

ax = axes[2][1]
loc = lesser
draw_part_with_label(ax, loc[0], "right", loc[1], loc[2], loc[3], params1)
add_map(ax, ctx.providers.CartoDB.Positron, 0.0025)

png

🚈 Coverage by Transport Mode

This section visualizes the coverage area of selected transport types: buses, trams, trains, and metro.

def filter_dist(route, dist):
    g = route_graph(conns, route)
    H = nx.Graph()
    H2 = nx.Graph()
    l1 = []
    for e1, e2 in g.edges:
        p1 = stops.loc[e1].to_numpy()
        p2 = stops.loc[e2].to_numpy()
        if np.linalg.norm(p1 - p2) < dist:
            l1.append((e1, e2))
    H.add_edges_from(l1)
    return H

def route_graph(cc, route_type):
    H = nx.Graph()
    h_edges = extract_edges(cc[cc["route_type"] == route_type])
    H.add_weighted_edges_from(h_edges)
    return H
fig, ax = plt.subplots(1, 1, figsize=(9, 9), layout="tight")
route = "Bus"
ax.set_title(f"{route} routes")
h1 = filter_dist(route, 0.05)
h2 = filter_dist(route, 0.15)
nx.draw_networkx(h2, pos=stop_pos, ax=ax, with_labels=False, node_size=0, node_color=redl, width=0.5, edge_color=redl, alpha=0.5)
nx.draw_networkx(h1, pos=stop_pos, ax=ax, with_labels=False, node_size=3, node_color=redl, width=1, edge_color=redl)
add_map(ax, ctx.providers.Esri.WorldGrayCanvas, 0.5)

fig, ax = plt.subplots(1, 1, figsize=(9, 9), layout="tight")
route = "Tram"
ax.set_title(f"{route} routes")
nx.draw_networkx(route_graph(conns, route), pos=stop_pos, ax=ax, with_labels=False, node_size=15, node_color=redf, width=1.4, edge_color=redl)
add_map(ax, ctx.providers.Esri.WorldGrayCanvas, 0.05)

fig, ax = plt.subplots(1, 1, figsize=(9, 9), layout="tight")
route = "Rail"
ax.set_title(f"{route} routes")
h1 = filter_dist(route, 0.1)
h2 = filter_dist(route, 1)
nx.draw_networkx(h2, pos=stop_pos, ax=ax, with_labels=False, node_size=10, node_color=redf, width=1, edge_color=redl, alpha=0.5, style="dashed")
nx.draw_networkx(h1, pos=stop_pos, ax=ax, with_labels=False, node_size=10, node_color=redf, width=1.4, edge_color=redl)
add_map(ax, ctx.providers.Esri.WorldGrayCanvas, 0.5)

fig, ax = plt.subplots(1, 1, figsize=(9, 9), layout="tight")
route = "Metro"
ax.set_title(f"{route} routes")
nx.draw_networkx(route_graph(conns, route), pos=stop_pos, ax=ax, with_labels=False, node_size=50, node_color=redf, width=3, edge_color=redf, alpha=0.95)
add_map(ax, ctx.providers.Esri.WorldGrayCanvas, 0.05)

png

png

png

png

📝 Notes

  • Of the transport modes shown, buses and trams have the densest lines, considering the scale.
  • Prague Integrated Transport (PID) covers a significant part of the Czech Republic, not just Prague.

The Rail routes graph shows short connections (<10 km) with solid lines, long connections (<110 km) with dashed lines, and connections longer than 110 km are not displayed.

📊 Node Analysis

Node analysis is performed using centrality measures. The following visualization displays selected stop centralities in the center of Prague.

A deeper node analysis will then be performed on the entire PID network.

def make_plotter(g, lbc, urc):
    # https://stackoverflow.com/a/67230951/13022894
    def plot_centrality(ax, centrality, *, label_count=0, **kwargs):
        ax.set_title(centrality.__name__)
        ax.set_xlim(lbc[1], urc[1])
        ax.set_ylim(lbc[0], urc[0])
        deg_cent = centrality(g, **kwargs)
        degree_centrality = pd.Series(deg_cent).sort_values(ascending=False)
        colors = degree_centrality / np.max(degree_centrality) * 5000
        cent = np.fromiter(deg_cent.values(), float)
        normalize = mcolors.Normalize(vmin=cent.min(), vmax=cent.max())
        scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=colormap)
        scalarmappaple.set_array(cent)
        fig.colorbar(scalarmappaple, ax=ax)
        colors = cent / np.max(cent) * 200
        nx.draw(g, stop_pos, ax=ax, node_size=colors, node_color=colors, cmap=colormap, edge_color=grey, width=0.6)
        nx.draw_networkx_labels(g.subgraph(degree_centrality[:label_count].index), stop_pos, ax=ax, verticalalignment='bottom')
        add_map(ax, ctx.providers.Esri.WorldGrayCanvas, 0.01)
    return plot_centrality
fig, axes = plt.subplots(2, 2, figsize=(13, 9), layout="constrained")
fig.suptitle("Network centralities - Centre of Prague", fontsize=20)
colormap = cm.YlOrRd

g = filter_d((np.array(centre[1]) + np.array(centre[2])) / 2, 0.1)
plot_centrality = make_plotter(g, centre[1], centre[2])
plot_centrality(axes[0][0], nx.degree_centrality, label_count=1)
plot_centrality(axes[0][1], nx.closeness_centrality)
plot_centrality(axes[1][0], nx.eigenvector_centrality, weight="weight", label_count=2)
plot_centrality(axes[1][1], nx.betweenness_centrality, weight="weight", label_count=6)

png

🗨️ Interpreting Centralities in Prague Centre

🌡️ Degree Centrality

  • The metro stations show the highest significance by degree, acting as hubs/authorities in the transport network.

📐 Closeness Centrality

  • In the dense center of Prague, all stops are significant in terms of closeness centrality.

⚖️ Eigenvector Centrality (Weighted Edges)

  • This centrality indicates that Karlovo náměstí metro station is a very important node in this part of Prague, with a strong influence on its neighbors.
  • (It quantifies the overall importance of nodes, the importance of edges originating from them, and the importance of their neighbors.)

🖇️ Betweenness Centrality

  • This metric provides information about the importance of individual nodes in relation to other shortest paths.
  • The red metro line C is particularly interesting for this section, as most of the shortest connections flow through it.

🌡️ Degree Centrality

This section focuses on analyzing degree centrality across the entire graph.

degrees = pd.Series([d for n, d in G.degree()])
degrees = degrees[degrees < 10]
fig, axes = plt.subplots(2, 1, figsize=(9, 4), sharex=True)
fig.suptitle("Network degree histogram (<10)", size=15)

ax = axes[0]
ax.hist(degrees, bins=(degrees.max() - degrees.min()).astype(int))
ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0), useMathText=True)
ax.set_yticks([1000, 2000, 3000, 4000])
ax.set_ylabel("Count")
ax.tick_params(labelbottom=False)
ax.tick_params(axis="x", which="both", length=0)

medianprops = dict(linewidth=2.5, color=(1, 0.38, 0.27))
flierprops = dict(marker="d", markerfacecolor=(1, 0.38, 0.27), markersize=8, markeredgecolor="none")
ax = axes[1]
ax.boxplot(degrees, vert=False, widths=[0.4], showfliers=True, flierprops=flierprops, medianprops=medianprops)
ax.get_yaxis().set_visible(False)
ax.set_xlabel("Degree")

fig.subplots_adjust(hspace=0.03)

png

def display_top(ps):
    df = pd.DataFrame(ps, columns=["Coeff"])
    styler = df.style.format(precision=7, thousands=" ", decimal=".")
    display(styler)
degree_centrality = pd.Series(nx.degree_centrality(G)).sort_values(ascending=False)
top10 = degree_centrality[:10]

print("Top 10 nodes with the highest degree centrality")
display_top(top10)

neighbours = set()
for n in top10.index:
    for u in nx.all_neighbors(G, n):
        neighbours.add(u)
for _ in range(3):
    nn = neighbours.copy()
    for n in nn:
        for u in nx.all_neighbors(G, n):
            neighbours.add(u)
cumulative = list(neighbours)
cumulative.extend(list(top10.index))
len(cumulative)

fig, ax = plt.subplots(1, 1, figsize=(12, 7), layout="constrained")
fig.suptitle("10 nodes with the highest degree centrality and their 3-neighbourhood", fontsize=15)
nx.draw_networkx(G.subgraph(cumulative), pos=stop_pos, ax=ax, node_size=10, node_color=blue, with_labels=False, alpha=0.1, edge_color=blue, width=1)
nx.draw_networkx(G.subgraph(top10.index), pos=stop_pos, ax=ax, node_size=50, node_color=redl, font_size=10, width = 0)
add_map(ax, ctx.providers.CartoDB.Positron, 0.5, False)

A = [49.5568518946815, 13.510323360859733]
B = [50.709352011905555, 15.35150400382775]
ax.set_xlim(A[1], B[1])
ax.set_ylim(A[0], B[0]);
Top 10 nodes with the highest degree centrality
 Coeff
Zličín0.0022995
Lihovar0.0021642
Černý Most0.0020289
Mladá Boleslav,aut.st.0.0018937
Kladno,Pražská křiž.0.0017584
Chrášťany0.0017584
Karlovo náměstí0.0017584
Jesenice0.0016232
Kobylisy0.0016232
Turnov,Terminál u žel.st.0.0016232

png

📝 Description

Nodes with the highest degree centrality are primarily large bus terminals. Connections from these nodes typically extend outside Prague, which is characteristic of bus hubs.

🏠 Neighborhood

The following maps visualize the immediate neighbors of these highly central nodes. With a few exceptions, these nodes’ neighbors are quite distant.

fig, axes = plt.subplots(3, 3, figsize=(12, 8), layout="tight")
fig.suptitle("Hubs and their neighborhoods", fontsize=16)
scales = [0.06, 0.1, 0.2, 0.2, 0.03, 0.2, 0.005, 0.2, 0.01]
for node, ax, scale in zip(top10.index[:9], axes.reshape(-1), scales):
    neighbours = set([node])
    for u in nx.all_neighbors(G, node):
        neighbours.add(u)

    ax.set_title(node)
    nx.draw_networkx(G.subgraph(neighbours), pos=stop_pos, ax=ax, node_size=30, node_color=grey,
                     with_labels=False, alpha=0.8, edge_color=grey, width=1.6)
    nx.draw_networkx(G.subgraph([node]), pos=stop_pos, ax=ax, node_size=100, node_color=redf, alpha=0.95, with_labels=False)
    add_map(ax, ctx.providers.CartoDB.Positron, scale, True)

png

📐 Closeness Centrality

centrality = pd.Series(nx.closeness_centrality(G)).sort_values(ascending=False)
lbc = [50.03774851490303, 14.331014307052019]
urc = [50.13010922185579, 14.500335441536306]
nodes = filter(lbc, urc)

f = centrality[list(nodes)].sort_values(ascending=False)
f = f.drop('Nad Pahorkem')
f = f.drop('Choceradská')
top10 = f[1:11]
print("Top 10 nodes with the highest closeness centrality")
display_top(top10)

neighbours = set()
for n in top10.index:
    for u in nx.all_neighbors(G, n):
        neighbours.add(u)
for _ in range(10):
    nn = neighbours.copy()
    for n in nn:
        for u in nx.all_neighbors(G, n):
            neighbours.add(u)
cumulative = list(neighbours)
cumulative.extend(list(top10.index))
len(cumulative)

fig, ax = plt.subplots(1, 1, figsize=(12, 6), layout="constrained")
fig.suptitle("10 nodes with the highest closeness centrality", fontsize=15)
ax.set_xlim(lbc[1], urc[1])
ax.set_ylim(lbc[0], urc[0])
nx.draw_networkx(G.subgraph(cumulative), pos=stop_pos, ax=ax, node_size=10, node_color=blue, with_labels=False, alpha=0.2, edge_color=blue, width=1)
nx.draw_networkx(G.subgraph(top10.index), pos=stop_pos, ax=ax, node_size=150, node_color=redl, font_size=10, width=0)
add_map(ax, ctx.providers.CartoDB.Positron, 0.01)
Top 10 nodes with the highest closeness centrality
 Coeff
Kačerov0.0741755
Na Veselí0.0740663
Lihovar0.0734723
Smíchovské nádraží0.0727514
Hlavní nádraží0.0721860
I. P. Pavlova0.0721479
Budějovická0.0715759
Pankrác0.0709528
Radlická0.0704498
Benkova0.0700958

png

🗨️ Interpretation

The highlighted stops are crucial points in the PID transport network. They have the lowest average distance to all other stops.

🖇️ Betweenness Centrality

centrality_bet = pd.Series(nx.betweenness_centrality(G, weight="weight", k=200)).sort_values(ascending=False)
lbc = [50.03774851490303, 14.331014307052019]
urc = [50.13010922185579, 14.500335441536306]
nodes = filter(lbc, urc)

top10 = centrality_bet[list(nodes)].sort_values(ascending=False)[:10]
print("Top 10 nodes with highest betweeness centrality")
display_top(top10)

neighbours = set()
for n in top10.index:
    for u in nx.all_neighbors(G, n):
        neighbours.add(u)
for _ in range(10):
    nn = neighbours.copy()
    for n in nn:
        for u in nx.all_neighbors(G, n):
            neighbours.add(u)
cumulative = list(neighbours)
cumulative.extend(list(top10.index))
len(cumulative)

fig, ax = plt.subplots(1, 1, figsize=(12, 6), layout="constrained")
fig.suptitle("10 nodes with the highest betweeness centrality", fontsize=15)
ax.set_xlim(lbc[1], urc[1])
ax.set_ylim(lbc[0], urc[0])
nx.draw_networkx(G.subgraph(cumulative), pos=stop_pos, ax=ax, node_size=10, node_color=blue, with_labels=False, alpha=0.2, edge_color=blue, width=1)
nx.draw_networkx(G.subgraph(top10.index), pos=stop_pos, ax=ax, node_size=150, node_color=redl, font_size=10, width=0)
add_map(ax, ctx.providers.CartoDB.Positron, 0.01)
Top 10 nodes with highest betweeness centrality
 Coeff
Na Veselí0.0889256
I. P. Pavlova0.0714338
Botanická zahrada Troja0.0397029
Hlavní nádraží0.0388959
Pankrác0.0383901
Střížkov0.0383537
Poliklinika Budějovická0.0358492
Zoologická zahrada0.0336205
Parkoviště Trojský most0.0325963
Prosek0.0313351

png

🗨️ Interpretation

The highlighted stops are frequently part of the shortest paths between other pairs of nodes.

❓ Insights

How does transport frequency change between weekdays and weekends, and across different transport types?

fig, axes = plt.subplots(6, figsize=(9, 10), sharex=True)
fig.suptitle(f"Activity by Route Types", y=0.92, fontsize=16)
fig.subplots_adjust(hspace=0.08)

fig.text(0.5, 0.04, "Time [hour]", ha="center", size=12)
fig.text(0.04, 0.5, "Count", va="center", rotation="vertical", size=12)

for route_type, ax in zip(types.index, fig.axes):
    d = conns[(conns["route_type"] == route_type) & (conns["monday"] | conns["tuesday"] | conns["wednesday"] | conns["thursday"] | conns["friday"])]
    series = d["depart_from_H"] * 3600 + d["depart_from_M"] * 60 + d["depart_from_S"]
    series = series / 3600
    n, _, _ = ax.hist(series, bins=48, alpha=0.8)
    ax.set_title(f"{route_type}", y=1.0, pad=-14, loc="left", color=grey)
    ax.set_yticks([0, max(n) * 2 / 4, max(n)])
    ax.set_ylim(0, max(n) * 1.4)
    ax.tick_params(axis="y", which="both", length=0)
    ax.grid(which="major", axis="x", color="grey", linestyle="dotted", linewidth=0.4)

    d = conns[(conns["route_type"] == route_type) & (conns["saturday"] | conns["saturday"])]
    series = d["depart_from_H"] * 3600 + d["depart_from_M"] * 60 + d["depart_from_S"]
    series = series / 3600
    n, _, _ = ax.hist(series, bins=48, color="k", histtype="step", linewidth=1, alpha=0.8)

handles, labels = plt.gca().get_legend_handles_labels()
patch = mpatches.Patch(color=red, label="Weekday", alpha=0.8)
line = Line2D([0], [0], label="Weekend", color="k", linewidth=1)
handles.extend([patch, line])
axes[0].legend(handles=handles, bbox_to_anchor=(1.04, 1), loc="upper left", prop={'size': 12})
ax.set_xticks(range(0, 25, 4))
ax.set_xlim(-1, 25);

png

🗨️ Interpretation

  • Transport activity is significantly lower on weekends. The exceptions are ferries and the funicular, which operate similarly on weekends. Metro and train frequencies are also comparable to weekdays outside of peak hours.
  • On weekdays, there are more connections during morning and evening rush hours, aligning with commuter travel patterns. This adaptation is not observed on weekends.
  • Overall, the frequency of connections is very low between 11 PM and 4 AM. For ferries and the funicular, it’s zero.

Do important stops change between day and night operations?

For this question, stop importance is analyzed using degree centrality.

G_weekday = build_graph(conns[conns['monday'] | conns['tuesday'] | conns['wednesday'] | conns['thursday'] | conns['friday']])
filter_d_weekday = subgroup_distance_filter(G_weekday, stops)

G_weekend = build_graph(conns[conns['saturday'] | conns['sunday']])
filter_d_weekend = subgroup_distance_filter(G_weekend, stops)

G_day = build_graph(conns[~conns['is_night']])
filter_d_day = subgroup_distance_filter(G_day, stops)

G_night = build_graph(conns[conns['is_night']])
filter_d_night = subgroup_distance_filter(G_night, stops)
loc = ['', [50.04909793668076, 14.379518399273855], [50.112147437060834, 14.481384833754964], 0.4]

fig, axes = plt.subplots(2, 1, figsize=(10, 12), layout="constrained")
fig.suptitle("Degree centrality - Central Prague", fontsize=20)

g = filter_d_day((np.array(loc[1]) + np.array(loc[2])) / 2, loc[3])
plot_centrality = make_plotter(g, loc[1], loc[2])
plot_centrality(axes[0], nx.degree_centrality, label_count=18)
axes[0].set_title("Daytime")

g = filter_d_night((np.array(loc[1]) + np.array(loc[2])) / 2, loc[3])
plot_centrality = make_plotter(g, loc[1], loc[2])
plot_centrality(axes[1], nx.degree_centrality, label_count=5)
axes[1].set_title("Nighttime");

png

🗨️ Interpretation

Yes, the importance of stops changes between day and night operations. For example, Lihovar, which is the most important stop by degree centrality during the day, is insignificant during night operations. Conversely, Anděl station becomes more significant at night.

Which out-of-Prague stops are the busiest?

As in the previous question, importance is again assessed by degree centrality.

mid = [50.089027446744275, 14.420374242931851]
g = filter_d_inv(mid, 0.2)
H = g.copy()
H.remove_node('Kladno,Pražská křiž.')
H.remove_node('Mladá Boleslav,Jičínská')
g = H
deg_cent = nx.degree_centrality(g)
degree_centrality = pd.Series(deg_cent).sort_values(ascending=False)
n = 6
top = degree_centrality[:n]
print(f"Top {n} nodes with the highest degree centrality")
display_top(top)

fig, ax = plt.subplots(1, 1, figsize=(14, 10), layout="constrained")
fig.suptitle(f"{n} nodes with the highest degree centrality (outside of Prague)", fontsize=16)

colors = degree_centrality / np.max(degree_centrality)
cent = np.fromiter(deg_cent.values(), float)
normalize = mcolors.Normalize(vmin=cent.min(), vmax=cent.max())
scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=colormap)
scalarmappaple.set_array(cent)
fig.colorbar(scalarmappaple, ax=ax)
colors = cent / np.max(cent) * 200
nx.draw(g, stop_pos, ax=ax, node_size=colors*0.4, node_color=colors, cmap=colormap, width=0, alpha=0.4)
indices = []
for n, v in top.items():
    indices.append(pd.Index(g.nodes).get_loc(n))
nx.draw_networkx_nodes(g.subgraph(top.index), stop_pos, ax=ax, node_size=colors[indices]*0.3, node_color=redf, alpha=0.4)
nx.draw_networkx_labels(g.subgraph(top.index), stop_pos, ax=ax, verticalalignment='bottom', font_size=11)
add_map(ax, ctx.providers.CartoDB.Positron, 0.5)
Top 6 nodes with the highest degree centrality
 Coeff
Mladá Boleslav,aut.st.0.0021771
Kladno,autobusové nádraží0.0019956
Mšeno0.0019956
Turnov,Terminál u žel.st.0.0019956
Říčany,K Žel.st.0.0019956
Mutějovice0.0018142

png

🗨️ Interpretation

Similar to the overall graph’s degree centrality, the most important stops outside of Prague are large bus stations.

Which stops are most significant during weekday mornings (6:00 - 11:00 AM)?

conns_morning = (conns['depart_from_H'] >= 6) & (conns['depart_from_H'] <= 11)
conns_weekday = (conns['monday'] | conns['tuesday'] | conns['wednesday'] | conns['thursday'] | conns['friday'])
G_morning_weekday = build_graph(conns[conns_morning & conns_weekday])
filter_d_mwd = subgroup_distance_filter(G_morning_weekday, stops)
g = filter_d_mwd(mid, 0.02)

deg_cent = nx.eigenvector_centrality(g, weight="weight", max_iter=1000)
degree_centrality = pd.Series(deg_cent).sort_values(ascending=False)
n = 15
top = degree_centrality[:n]
print(f"Top {n} nodes with the highest eigenvector centrality")
display_top(top)

fig, ax = plt.subplots(1, 1, figsize=(12, 10), layout="constrained")
fig.suptitle(f"{n} nodes with the highest eigenvector centrality on weekday morning", fontsize=16)

colors = degree_centrality / np.max(degree_centrality)
cent = np.fromiter(deg_cent.values(), float)
# normalize = mcolors.Normalize(vmin=cent.min(), vmax=cent.max())
# scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=colormap)
# scalarmappaple.set_array(cent)
# fig.colorbar(scalarmappaple, ax=ax)
colors = cent / np.max(cent) * 200
nx.draw(g, stop_pos, ax=ax, node_size=colors*0.4, node_color=colors, cmap=colormap, width=0, alpha=0.6)
indices = []
for n, v in top.items():
    indices.append(pd.Index(g.nodes).get_loc(n))
nx.draw_networkx_nodes(g.subgraph(top.index), stop_pos, ax=ax, node_size=colors[indices]*0.3, node_color=redf, alpha=0.8)
nx.draw_networkx_labels(g.subgraph(top.index), stop_pos, ax=ax, verticalalignment='bottom', font_size=11)
add_map(ax, ctx.providers.CartoDB.Positron, 0.01, False)
Top 15 nodes with the highest eigenvector centrality
 Coeff
Karlovo náměstí0.4855651
Národní třída0.4425812
Štěpánská0.3632039
Národní divadlo0.3594138
Novoměstská radnice0.2402880
Lazarská0.2090533
Újezd0.2071608
I. P. Pavlova0.2038303
Moráň0.1490697
Vodičkova0.1294884
Hellichova0.1234722
Palackého náměstí0.1125779
Malostranské náměstí0.0819371
Václavské náměstí0.0787817
Staroměstská0.0783107

png

🗨️ Interpretation

The highlighted stops are the most significant on weekday mornings (6:00 AM - 11:00 AM). In the PID network, they connect influential nodes and form a crucial part of important connections.

Open Data Attribution

Copyright and license for the openly available geographical data used in the visualizations: