BGP Correlation Violin Plot

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
%matplotlib inline
#input file
#[prefix, mean(hopcount), ASPathLength, min(hopcount), max(hopcount), var(hopcount), stdev(hopcount)]

# files at leavenworth:/srv/2016-ba-wickenheiser-scheitle/bgpttl/result
#ipv6 file
filename2 = "bgpttl_20160319-073130.csv"
#ipv4 file
filename1 = "bgpttl_20160321-102030.csv"

#create dataframes
df1 = pd.read_csv(filename1)
df2 = pd.read_csv(filename2)

#rename keys cause they are a bit messed up in the original csv file (optional)
df1.columns=["prefix", "mean(hopcount)", "ASPathLength", "min(hopcount)", "max(hopcount)", "var(hopcount)", "stdev(hopcount)"]
df2.columns=["prefix", "mean(hopcount)", "ASPathLength", "min(hopcount)", "max(hopcount)", "var(hopcount)", "stdev(hopcount)"]

#add column for IPv
df1["IP Version"] = "IPv4"
df2["IP Version"] = "IPv6"

#combine dataframes for split violin plot
combined = df1.append(df2)

fig, ax = plt.subplots()
ax= sns.violinplot(x="ASPathLength", y="mean(hopcount)", hue="IP Version", data=combined, split=False, inner="box")#,scale="count")
ax.set(xlabel='AS Path Length', ylabel='Mean Hop Count per Prefix (capped at 40)')
# this would add regression lines
#sns.regplot(x="ASPathLength", y="mean(hopcount)" , ci= None, scatter=False, data=df1, color="g")
#sns.regplot(x="ASPathLength", y="mean(hopcount)" , ci= None, scatter=False, data=df2, color="b")
ax.set_ylim(ymin=0,ymax=40);#,xmax=100000);
In [9]:
fig.savefig('../../figures/bgpcorr-violin.pdf', format='pdf', dpi=2000)
In [10]:
%%bash
pdfcrop ../../figures/bgpcorr-violin.pdf
git add ../../figures/bgpcorr-violin-crop.pdf
PDFCROP 1.38, 2012/11/02 - Copyright (c) 2002-2012 by Heiko Oberdiek.
==> 1 page written on `../../figures/bgpcorr-violin-crop.pdf'.

Regression Statistics

In [11]:
from scipy import stats

#use scipy to calculate the linear regression parameters (not possible in seaborn at the moment)
x_ipv6 = df1["ASPathLength"].as_matrix()
y_ipv6 = df1["mean(hopcount)"].as_matrix()

x_ipv4 = df2["ASPathLength"].as_matrix()
y_ipv4 = df2["mean(hopcount)"].as_matrix()

#IPv6
print("IPv6:")
slope, intercept, r_value, p_value, std_err = stats.linregress(x_ipv6,y_ipv6)
print("slope: " + str(slope))
print("intercept: " + str(intercept))
print("r_value: " + str(r_value))
print("r^2: " + str(r_value**2)) 
print("p_value: " + '{0:.16f}'.format(p_value))
print("std_err: " + str(std_err))

#IPv4
print("IPv4:")
slope, intercept, r_value, p_value, std_err = stats.linregress(x_ipv4,y_ipv4)
print("slope: " + str(slope))
print("intercept: " + str(intercept))
print("r_value: " + str(r_value))
print("r^2: " + str(r_value**2)) 
print("p_value: " + '{0:.16f}'.format(p_value))
print("std_err: " + str(std_err))
IPv6:
slope: 1.1448628695
intercept: 7.74384353358
r_value: 0.228559775394
r^2: 0.0522395709281
p_value: 0.0000000000000000
std_err: 0.00994761208788
IPv4:
slope: 1.81736130153
intercept: 2.18436019233
r_value: 0.392976080625
r^2: 0.154430199943
p_value: 0.0000000014110876
std_err: 0.287361004777