This notebook shows how to get neighbour nodes of a given protein in pypath. It also looks for the shortest paths between two given nodes. This can be used to extract a smaller network that connects some nodes of interest.
# Show all the plots inside the notebook
%matplotlib inline
# load packages
import pypath
import igraph # import igraph to use the plot function
import numpy as np
import pandas as pd
import seaborn as sns
# Load the ipython display and image module
from IPython.display import Image
from IPython.display import display
# create the PyPath object that holds the network used throughout this notebook
pa = pypath.main.PyPath()
# load the network with the default settings (the accompanying text calls it OmniPath)
pa.init_network()
# remove links reported in papers with more than 50 interactions (by default)
pa.remove_htp()
After loading OmniPath, we will prepare a list of the proteins we want to query. The following are the 5 most frequently mutated genes in prostate cancer.
# gene symbols of the proteins we want to query
query_nodes = {'PTEN', 'FOXA1', 'TP53', 'SPOP', 'AR'}
# longest neighbour list that is printed in full
max_shown = 10

for igene in query_nodes:
    # pypath's genesymbol() returns the vertex whose label is the given gene symbol;
    # for any other attribute the igraph find() method can be used instead,
    # e.g. pa.graph.vs.find(label=igene)
    prot = pa.genesymbol(igene)['name']
    # gene symbols of the direct neighbours
    # (pa.first_neighbours(prot) is an alternative that starts from the vertex name)
    neighbours_of_prot = list(pa.gs_neighbors(igene).gs())
    print('{} ({}) has {} neighbours:'.format(igene, prot, len(neighbours_of_prot)))
    # announce the truncation only when the list really is cut short
    if len(neighbours_of_prot) > max_shown:
        print('(showing only {} proteins)'.format(max_shown))
    print(neighbours_of_prot[:max_shown])
    print('---')
# settings handed to every igraph.plot() call below: bounding box and margin of the figure
visual_style = dict(bbox=(300, 300), margin=50)

# neighbourhood graph of each query gene, keyed by gene symbol
subgraph = {}
for igene in query_nodes:
    gene_vertex = pa.genesymbol(igene)['name']
    neighbourhood = pa.neighbourhood_network(gene_vertex)
    subgraph[igene] = neighbourhood
    igraph.plot(neighbourhood, layout=neighbourhood.layout_auto(), **visual_style)
# plot the neighbourhoods of SPOP and FOXA1, one after the other
# for some reason, the node labels are not always correctly displayed inside IPython notebook
# however, they appear correctly if printed to a file
for igene, png_file in (('SPOP', 'neigbourhood_SPOP.png'),
                        ('FOXA1', 'neigbourhood_FOXA1.png')):
    neighbourhood = subgraph[igene]
    print(neighbourhood.vs['label'])
    plot2 = igraph.plot(neighbourhood, layout=neighbourhood.layout_auto(), **visual_style)
    # write the figure to disk, then load that image back into the notebook
    plot2.save(png_file)
    display(Image(png_file))
# passing a file name to igraph.plot() writes the graph straight to that file;
# the *.pdf generated this way contains the graph with the correct labels
#igraph.plot(subgraph[igene], 'FOXA1_neighbourhood.pdf', layout=subgraph[igene].layout_auto(), **visual_style)
# vertex names of the two proteins we want to connect
spop_vertex = pa.genesymbol('SPOP')['name']
foxa1_vertex = pa.genesymbol('FOXA1')['name']

# shortest path from SPOP to FOXA1
# the result comes back as a list with a single element, the path itself
path = pa.graph.get_shortest_paths(spop_vertex, to=foxa1_vertex)[0]
# the path lists vertices, so it has one step less than it has members
path_SPOP_to_FOXA1_length = len(path) - 1
print('The path from SPOP to FOXA1 has {} steps:'.format(path_SPOP_to_FOXA1_length))
labels_on_path = [pa.graph.vs[vertex_id]['label'] for vertex_id in path]
print('\t' + ' --> '.join(labels_on_path))

# shortest path in the opposite direction, from FOXA1 to SPOP
path = pa.graph.get_shortest_paths(foxa1_vertex, to=spop_vertex)
# alternative way of showing the path members: one label per line
for vertex_id in path[0]:
    print(pa.graph.vs[vertex_id]['label'])
# collect every path between SPOP and FOXA1 that is as long as the shortest one
# genesymbol() gives direct access to the vertex index; for other attributes igraph's
# select() can be used, e.g. pa.graph.vs.select(label='SPOP').indices[0]
node_start = pa.genesymbol('SPOP').index
node_end = pa.genesymbol('FOXA1').index
paths = pa.find_all_paths(start=node_start, end=node_end, maxlen=path_SPOP_to_FOXA1_length)
print('Number of paths: {}'.format(len(paths)))

# distinct vertices visited by any of the paths
p = set()
for sublist in paths:
    p.update(sublist)
print('Number of nodes: {}'.format(len(p)))

# graph induced by the vertices found on the paths
connection_graph = pa.graph.induced_subgraph(p)
plot2 = igraph.plot(connection_graph, layout=connection_graph.layout_auto(), **visual_style)
plot2.save('connection_SPOP_FOXA1.png')
display(Image('connection_SPOP_FOXA1.png'))

# adjacency matrix of the connection graph
igraph.plot(connection_graph.get_adjacency())

# degree distribution: left edge and count of every histogram bin
degree_histogram = connection_graph.degree_distribution()
xs, ys = zip(*((left, count) for left, _, count in degree_histogram.bins()))
# NOTE(review): `sns.plt` is seaborn's alias of matplotlib.pyplot and, as far as I recall,
# was dropped in newer seaborn releases -- confirm it exists in the installed version
sns.plt.bar(xs, ys)
sns.plt.title('Degree distribution of shortest path network between SPOP and FOXA1')

# second plot: vertex size follows the degree and only well connected vertices get a label
degree_threshold = 10
vertex_degrees = connection_graph.degree()
vertex_labels = connection_graph.vs['label']
label_tmp = [
    label if degree > degree_threshold else '\n'
    for label, degree in zip(vertex_labels, vertex_degrees)
]
plot2 = igraph.plot(
    connection_graph,
    layout=connection_graph.layout_auto(),
    vertex_label=label_tmp,
    vertex_size=vertex_degrees,
    vertex_color='#ff000022',
    **visual_style
)
plot2.save('connection_SPOP_FOXA1_(v2).png')
display(Image('connection_SPOP_FOXA1_(v2).png'))

# list the vertices whose degree exceeds the threshold
label_high_degree = [
    label for label, degree in zip(vertex_labels, vertex_degrees)
    if degree > degree_threshold
]
print('Nodes with degree greater than {}:'.format(degree_threshold))
print(label_high_degree)
# repeat the search, now allowing paths one step longer than the shortest path
new_maxlen = path_SPOP_to_FOXA1_length + 1
node_start = pa.genesymbol('SPOP').index
node_end = pa.genesymbol('FOXA1').index
paths = pa.find_all_paths(start=node_start, end=node_end, maxlen=new_maxlen)
print('Number of paths: {}'.format(len(paths)))

# distinct vertices visited by any of the paths
p = set()
for sublist in paths:
    p.update(sublist)
print('Number of nodes: {}'.format(len(p)))

# graph induced by those vertices
connection_graph = pa.graph.induced_subgraph(p)
plot2 = igraph.plot(connection_graph, layout=connection_graph.layout_auto(), **visual_style)
plot2.save('connection_SPOP_FOXA1_l5.png')
display(Image('connection_SPOP_FOXA1_l5.png'))

# degree distribution: left edge and count of every histogram bin
degree_histogram = connection_graph.degree_distribution()
xs, ys = zip(*((left, count) for left, _, count in degree_histogram.bins()))
# NOTE(review): `sns.plt` is seaborn's alias of matplotlib.pyplot and, as far as I recall,
# was dropped in newer seaborn releases -- confirm it exists in the installed version
sns.plt.bar(xs, ys)
sns.plt.title('Degree distribution of shortest path network (min_length+1) between SPOP and FOXA1')

# second plot: vertex size follows the degree; only vertices with degree above 90 get a label
vertex_degrees = connection_graph.degree()
vertex_labels = connection_graph.vs['label']
label_tmp = [
    label if degree > 90 else '\n'
    for label, degree in zip(vertex_labels, vertex_degrees)
]
plot2 = igraph.plot(
    connection_graph,
    layout=connection_graph.layout_auto(),
    vertex_label=label_tmp,
    vertex_size=vertex_degrees,
    vertex_color='#ff000022',
    **visual_style
)
plot2.save('connection_SPOP_FOXA1_l5_(v2).png')
display(Image('connection_SPOP_FOXA1_l5_(v2).png'))

# the printed list uses its own, lower threshold than the plot labels
degree_threshold = 40
label_high_degree = [
    label for label, degree in zip(vertex_labels, vertex_degrees)
    if degree > degree_threshold
]
print('Nodes with degree greater than {}:'.format(degree_threshold))
print(label_high_degree)
We will now try to obtain a network composed of nodes involved in the first shortest path found between each pair of nodes.
# a set has no order and recent pandas versions refuse it as index/columns,
# so fix the order of the genes once and use it for the table and the loops
gene_order = sorted(query_nodes)
# look up the vertex name of every gene a single time instead of once per pair
vertex_of_gene = {gene: pa.genesymbol(gene)['name'] for gene in gene_order}

# vertices found on the first shortest path of any pair of query genes
node_list = set()
# distance[a][b]: number of steps of the shortest path between genes a and b
distance = pd.DataFrame(np.nan, index=gene_order, columns=gene_order)
for igene1 in gene_order:
    for igene2 in gene_order:
        if igene1 == igene2:
            distance.loc[igene1, igene2] = 0
            continue
        # the result is a list with a single element, the path itself
        path = pa.graph.get_shortest_paths(vertex_of_gene[igene1], to=vertex_of_gene[igene2])[0]
        node_list.update(path)
        # an empty path is recorded as NaN instead of a length of -1
        distance.loc[igene1, igene2] = len(path) - 1 if path else np.nan

# graph induced by all the vertices collected above
interconnection_graph = pa.graph.induced_subgraph(node_list)
plot2 = igraph.plot(interconnection_graph, layout=interconnection_graph.layout_auto(), **visual_style)
plot2.save('connection_5nodes.png')
display(Image('connection_5nodes.png'))
# build the directed version of the network; it is accessed below as pa.dgraph
pa.get_directed()

# a set has no order and recent pandas versions refuse it as index/columns,
# so fix the order of the genes once and use it for the table and the loops
gene_order = sorted(query_nodes)
# look up the directed-graph vertex name of every gene a single time instead of once per pair
dvertex_of_gene = {gene: pa.dgenesymbol(gene)['name'] for gene in gene_order}

# vertices found on the first directed shortest path of any ordered pair of query genes
dnode_list = set()
# ddistance[a][b]: number of steps of the directed shortest path from gene a to gene b
ddistance = pd.DataFrame(np.nan, index=gene_order, columns=gene_order)
for igene1 in gene_order:
    for igene2 in gene_order:
        if igene1 == igene2:
            ddistance.loc[igene1, igene2] = 0
            continue
        # the result is a list with a single element, the path itself
        path = pa.dgraph.get_shortest_paths(dvertex_of_gene[igene1], to=dvertex_of_gene[igene2])[0]
        dnode_list.update(path)
        # an empty path is recorded as NaN instead of a length of -1
        ddistance.loc[igene1, igene2] = len(path) - 1 if path else np.nan

# directed graph induced by all the vertices collected above
interconnection_dgraph = pa.dgraph.induced_subgraph(dnode_list)
plot2 = igraph.plot(interconnection_dgraph, layout=interconnection_dgraph.layout_auto(), **visual_style)
plot2.save('connection_d_5nodes.png')
display(Image('connection_d_5nodes.png'))
Now we focus on extracting an undirected network containing all the nodes present in all the possible shortest paths between our genes of interest.
# vertices found on any path between two query genes that is no longer than their shortest path
node_list_all = set()
for igene1 in query_nodes:
    for igene2 in query_nodes:
        if igene1 == igene2:
            continue
        shortest = distance.loc[igene1, igene2]
        # the distance table holds NaN for a pair without a path: nothing to collect
        if np.isnan(shortest):
            continue
        # the table stores floats; hand the path length over as a plain integer
        paths = pa.find_all_paths(
            start=pa.genesymbol(igene1).index,
            end=pa.genesymbol(igene2).index,
            maxlen=int(shortest),
        )
        for sublist in paths:
            node_list_all.update(sublist)

# graph induced by all the vertices collected above
interconnection_all_graph = pa.graph.induced_subgraph(node_list_all)
plot2 = igraph.plot(interconnection_all_graph, layout=interconnection_all_graph.layout_auto(), **visual_style)
plot2.save('connection_5nodes_asp.png')
display(Image('connection_5nodes_asp.png'))
Once we have our network, we can save it to a file in order to analyse it with other programs or algorithms.
We will count the number of references for each edge and store it in a new edge attribute. Then, we can use the function write_ncol(), available to any igraph object, to write a space-delimited file with the labels of the source and target vertices of each edge, as well as one additional numerical attribute of each edge.
# store the number of references behind every edge in a new edge attribute
reference_lists = interconnection_dgraph.es['references']
interconnection_dgraph.es['nrefs'] = [len(references) for references in reference_lists]

# 'dsp' here wants to represent 'from directed network using shortest path'
file_to_write = 'prior_network_dsp.ncol'
# vertices are identified by their label, the reference count is written as edge weight
interconnection_dgraph.write_ncol(file_to_write, names='label', weights='nrefs')
We can also define our own function for increased flexibility. For example, here we define a function that writes the network information to a tab-delimited file. It can print several attributes of the source and target vertices, as well as several attributes of each edge.
def custom_write(fname, graph, names=('name',), edge_attributes=(), sep='\t'):
    """
    Write the edge list of a graph, with attributes, to a delimited text file.

    The first row is a header. Every following row describes one edge: for
    each requested vertex attribute the value of the source vertex followed
    by the value of the target vertex, and then the requested edge
    attributes. All values are converted to text, so attributes do not need
    to be strings.

    @param fname: path of the file to write; it is opened in write mode, so
        an existing file is overwritten
    @param graph: the igraph object containing the network
    @param names: vertex attribute names to be printed for source and target
        vertices; names the graph does not have are skipped
    @param edge_attributes: edge attribute names to be printed; names the
        graph does not have are skipped
    @param sep: string used to separate columns
    """
    # keep only the requested attributes that exist in the graph, in the requested order
    known_vertex_attrs = set(graph.vs.attribute_names())
    known_edge_attrs = set(graph.es.attribute_names())
    names = [iname for iname in names if iname in known_vertex_attrs]
    edge_attributes = [eattr for eattr in edge_attributes if eattr in known_edge_attrs]

    # one 'source_<attr>' / 'target_<attr>' column pair per vertex attribute,
    # then one column per edge attribute
    header = ['{}_{}'.format(end, iname) for iname in names for end in ('source', 'target')]
    header.extend(edge_attributes)

    with open(fname, 'wt') as fid:
        # joining the complete row at once avoids a dangling separator
        # when no edge attribute is written
        fid.write(sep.join(header) + '\n')
        for edge in graph.es:
            row = [graph.vs[v][iname] for iname in names for v in edge.tuple]
            row.extend(edge[eattr] for eattr in edge_attributes)
            # format every value so that non-string attributes can be written too
            fid.write(sep.join('{}'.format(value) for value in row) + '\n')
# vertex and edge columns written to both export files
vertex_columns = ['name', 'label']
edge_columns = ['nrefs']

# directed shortest-path network ('dsp')
custom_write('prior_network_dsp.txt', interconnection_dgraph, names=vertex_columns, edge_attributes=edge_columns)

# 'asp' here wants to represent 'from undirected network using all possible shortest paths'
# store the number of references behind every edge before writing the file
asp_reference_lists = interconnection_all_graph.es['references']
interconnection_all_graph.es['nrefs'] = [len(references) for references in asp_reference_lists]
custom_write('prior_network_asp.txt', interconnection_all_graph, names=vertex_columns, edge_attributes=edge_columns)