In Lescarbeu et al. (2014), a phosphoproteomic dataset in prostate cancer cell lines is provided. This notebook recopilates the proteins considered, measured and perturbed in that work and tries to recover a prior-knowledge network supported by literature using pypath.
# Show all the plots inside the notebook
%matplotlib inline
# load packages
import pypath
import igraph # import igraph to use the plot function
import numpy as np
import pandas as pd
import seaborn as sns
# Load the ipython display and image module
from IPython.display import Image
from IPython.display import display
display(Image(url="https://static-content.springer.com/image/art%3A10.1186%2F1471-2407-14-325/MediaObjects/12885_2013_Article_4524_Fig1_HTML.jpg"))
Figure 1 An overview of the measured signaling pathways and responses to treatment. A) A diagrammatic overview illustrating the cell signaling between the phosphoproteins measured (blue lettering), the treatments used (green lettering), and the inhibitors which were used on the LNCaP cells (red lines). B) Heatmap with hierarchical clustering illustrating the mean centered and variance scaled (z-score) changes in phosphoprotein values in response to varying treatments (see Methods), as well as survival values of LNCaP, PC3, or MDA-PCa-2b cells as measured via a MTT assay. 
In Figure 1 of Lescarbeau et al. (2014) we can see an squema of the studied signaling network. We will translate the names in the diagram to gene IDs and try to recover a network using pypath.
# load file and keep only the second column
seed_list_table = pd.read_csv("../../Data/Lescarbeau_et_al_2014/model_nodes.txt", sep='\t', header=0)
seed_list = seed_list_table.iloc[:,1]
print(seed_list_table)
pa = pypath.main.PyPath()
pa.init_network()
# remove links reported in papers with more than 50 interactions (by default)
pa.remove_htp()
prot = [(igene, pa.genesymbol(igene)['name']) for igene in list(seed_list.as_matrix()) if pa.genesymbol(igene) is not None]
We can see first how the neighbourhood of the nodes listed looks like.
neighbour_set = set()
for igene, iprot in prot:
neighbour_set.update(pa.gs_neighborhood(igene))
neighbour_network = pa.graph.induced_subgraph(neighbour_set)
layout = neighbour_network.layout_fruchterman_reingold(repulserad = neighbour_network.vcount() ** 2.8,
maxiter = 1000, area = neighbour_network.vcount() ** 2.3)
plot2 = igraph.plot(neighbour_network, layout = layout, vertex_size = 0.5,
vertex_border_width = 0, edge_width = 0.2, edge_color = '#33333377', vertex_label_size = 9)
plot2.save('Lescarbeu_et_al_2014_neighbourhood.png')
display(Image('Lescarbeu_et_al_2014_neighbourhood.png'))
print('Number of edges: {}'.format(neighbour_network.ecount()))
print('Number of nodes: {}'.format(neighbour_network.vcount()))
The neighbours of the queried nodes span quite a big network. We will try to get a network only involving the nodes queried.
query_set = set()
for igene, iprot in prot:
query_set.add(pa.gs(igene))
query_set_network = pa.graph.induced_subgraph(query_set)
layout = query_set_network.layout_fruchterman_reingold(repulserad = query_set_network.vcount() ** 2.8,
maxiter = 1000, area = query_set_network.vcount() ** 2.3)
plot2 = igraph.plot(query_set_network, layout = layout, vertex_size = 0.5,
vertex_border_width = 0, edge_width = 0.2, edge_color = '#33333377', vertex_label_size = 9)
plot2.save('Lescarbeu_et_al_2014_set.png')
display(Image('Lescarbeu_et_al_2014_set.png'))
print('Number of edges: {}'.format(query_set_network.ecount()))
print('Number of nodes: {}'.format(query_set_network.vcount()))
This network looks much more manageable. There is one big connected component and only 3 nodes that lack a direct connection to the rest.
# extract connected components using igraph's clusters() function
query_set_clusters = query_set_network.clusters()
n_clusters = len(query_set_clusters)
print('Number of connected components: {}'.format(n_clusters))
for i in xrange(n_clusters):
print('\tComponent {} size: {}'.format(i, len(query_set_clusters[i])))
We can see what the members of those isolated nodes are and search to which nodes can be connected with the least number of intermediary nodes.
for i in range(1, n_clusters):
print('Genes in component {}:'.format(i))
print(query_set_network.vs[query_set_clusters[i]]['label'])
# look for shortest path between all pair of nodes
node_list = set()
query_nodes = [i[0] for i in prot]
distance = pd.DataFrame(np.nan, index=query_nodes, columns=query_nodes)
for igene1, iprot1 in prot:
for igene2, iprot2 in prot:
if igene1 == igene2:
distance.loc[igene1, igene2] = 0
else:
path = pa.graph.get_shortest_paths(pa.genesymbol(igene1)['name'], to=pa.genesymbol(igene2)['name'])[0]
#node_list = node_list.union(set(path))
node_list.update(path)
distance.loc[igene1, igene2] = len(path)-1 if len(path)>0 else np.nan
n_allowed_steps = 2
for i in range(1, n_clusters):
for j in query_set_network.vs[query_set_clusters[i]]['label']:
print('{} can be connected in {} steps to:'.format(j, n_allowed_steps))
print('\t'+'\n\t'.join([k for k in distance.loc[j, distance.loc[j,:]==n_allowed_steps].index]))
We can see that allowing only one intermediary node we can reach many nodes in the big connected component. Before continuing, we will repeat the process but with a directed version of the network.
pa.get_directed()
query_set_d = set()
for igene, iprot in prot:
query_set_d.add(pa.dgs(igene))
query_set_dnetwork = pa.dgraph.induced_subgraph(query_set_d)
layout = query_set_dnetwork.layout_fruchterman_reingold(repulserad = query_set_dnetwork.vcount() ** 2.8,
maxiter = 1000, area = query_set_dnetwork.vcount() ** 2.3)
plot2 = igraph.plot(query_set_dnetwork, layout = layout, vertex_size = 0.5,
vertex_border_width = 0, edge_width = 0.2, edge_color = '#33333377', vertex_label_size = 9)
plot2.save('Lescarbeu_et_al_2014_set_dnetwork.png')
display(Image('Lescarbeu_et_al_2014_set_dnetwork.png'))
print('Number of edges: {}'.format(query_set_dnetwork.ecount()))
print('Number of nodes: {}'.format(query_set_dnetwork.vcount()))
The network is still manageable, but we can observe that now IL-6 and IL6R are disconnected from the rest of the network.
# extract connected components using igraph's clusters() function
# here mode='weak' instead of the default value 'strong'
query_set_dclusters = query_set_dnetwork.clusters(mode='weak')
n_dclusters = len(query_set_dclusters)
print('Number of connected components: {}'.format(n_dclusters))
for i in xrange(n_dclusters):
print('\tComponent {} size: {}'.format(i, len(query_set_dclusters[i])))
for i in range(1, n_dclusters):
print('Genes in component {}:'.format(i))
print(query_set_dnetwork.vs[query_set_dclusters[i]]['label'])
IL6R was connected to JAK1 in the undirected network. The reason for not being connected in the directed network is that when getting a directed network, by default, only edges with an explicit reference to its directionality are kept.
We can add this link manually to our network, as we are trying to get a single network that connects all the nodes and we have a reference for it (although without explicit directionality).
# add directed edge from IL6R to JAK1
label_source = 'IL6R'
label_target = 'JAK1'
iprot_source = query_set_dnetwork.vs.find(label=label_source).index
iprot_target = query_set_dnetwork.vs.find(label=label_target).index
query_set_dnetwork.add_edge(iprot_source, iprot_target)
# copy attribute values present in the undirected network
original_attributes = pa.graph.es(pa.get_edge([pa.gs(label_source), pa.gs(label_target)]))[0].attributes()
new_edge_id = query_set_dnetwork.get_eid(iprot_source, iprot_target)
new_edge_pointer = query_set_dnetwork.es(new_edge_id)[0]
for (ikey, ivalue) in original_attributes.iteritems():
new_edge_pointer[ikey] = ivalue
We can check again the number of connected components:
# extract connected components using igraph's clusters() function
# here mode='weak' instead of the default value 'strong'
query_set_dclusters = query_set_dnetwork.clusters(mode='weak')
n_dclusters = len(query_set_dclusters)
print('Number of connected components: {}'.format(n_dclusters))
for i in xrange(n_dclusters):
print('\tComponent {} size: {}'.format(i, len(query_set_dclusters[i])))
As before, we will seek how to connect the isolated nodes to the rest of the network (but using only directed edges).
# look for shortest path between all pair of nodes
dnode_list = set()
query_nodes = [i[0] for i in prot]
ddistance = pd.DataFrame(np.nan, index=query_nodes, columns=query_nodes)
for igene1, iprot1 in prot:
for igene2, iprot2 in prot:
if igene1 == igene2:
ddistance.loc[igene1, igene2] = 0
else:
path = pa.dgraph.get_shortest_paths(pa.genesymbol(igene1)['name'], to=pa.genesymbol(igene2)['name'])[0]
#node_list = node_list.union(set(path))
dnode_list.update(path)
ddistance.loc[igene1, igene2] = len(path)-1 if len(path)>0 else np.nan
n_allowed_steps = 2
for i in range(1, n_dclusters):
for j in query_set_dnetwork.vs[query_set_dclusters[i]]['label']:
print('{} can be connected in {} steps to:'.format(j, n_allowed_steps))
print('\t'+'\n\t'.join([k for k in distance.loc[j, distance.loc[j,:]==n_allowed_steps].index]))
print('{} can be connected in {} steps from:'.format(j, n_allowed_steps))
print('\t'+'\n\t'.join([k for k in distance.loc[distance.loc[:,j]==n_allowed_steps,j].index]))
print('---')
As we can see, the isolated nodes can be connected to many other nodes in the connected components using one additional intermediary node not present in the queried list of nodes.
We can decide to include all this possible edges or only some of them based on the diagram of Figure 1. For example, $\beta$-Tubulin (TUBB) is not connected to other nodes in the diagram. Also, JNK3 (MAPK10) may already be considered to be represented by JNK1 and JNK2 (MAPK8 and MAPK9 respectively). On the other hand, p38 (MAPK13) is connected to Rac (RAC1) and HSP27 (HSPB1) in that diagram.
We can check one of the shortest paths that connects RAC1 to MAPK13 and another that connects MAPK13 to HSPB1.
i_label_source = 'RAC1'
i_label_target = 'MAPK13'
path = pa.dgraph.get_shortest_paths(pa.dgenesymbol(i_label_source)['name'], to=pa.dgenesymbol(i_label_target)['name'])[0]
print('\t' + ' --> '.join(pa.dgraph.vs[i]['label'] for i in path))
i_label_source = 'MAPK13'
i_label_target = 'HSPB1'
path = pa.dgraph.get_shortest_paths(pa.dgenesymbol(i_label_source)['name'], to=pa.dgenesymbol(i_label_target)['name'])[0]
print('\t' + ' --> '.join(pa.dgraph.vs[i]['label'] for i in path))
As we can see, to go from RAC1 to MAPK13 we need at least 3 steps in the directed network. We may decide, however, to add this connection nonetheless. In other words, we can decide to complete the network with the diagram in Figure 1 in mind. If the small network obtained this way does not explain the observations we can go back and consider other alternative wiring. We will leave TUBB and MAPK10 isolated for the moment too.
pkn_network = query_set_dnetwork.copy()
# add edge between RAC1 and MAPK13
label_source = 'RAC1'
label_target = 'MAPK13'
iprot_source = pkn_network.vs.find(label=label_source).index
iprot_target = pkn_network.vs.find(label=label_target).index
pkn_network.add_edge(iprot_source, iprot_target)
# add edge between MAPK13 and HSPB1
label_source = 'MAPK13'
label_target = 'HSPB1'
iprot_source = pkn_network.vs.find(label=label_source).index
iprot_target = pkn_network.vs.find(label=label_target).index
pkn_network.add_edge(iprot_source, iprot_target)
# extract connected components using igraph's clusters() function
# here mode='weak' instead of the default value 'strong'
pkn_network_clusters = pkn_network.clusters(mode='weak')
n_pkn_network_clusters = len(pkn_network_clusters)
print('Number of connected components: {}'.format(n_pkn_network_clusters))
for i in xrange(n_pkn_network_clusters):
print('\tComponent {} size: {}'.format(i, len(pkn_network_clusters[i])))
The edges of the network may also contain information on wether the interaction is of inhibition or not. We can color the edges red for those that are documented as inhibition.
layout = pkn_network.layout_fruchterman_reingold(repulserad = pkn_network.vcount() ** 2.8,
maxiter = 1000, area = pkn_network.vcount() ** 2.3)
edge_color = ['#FF333377' if ((e['dirs'] is not None) and e['dirs'].is_inhibition()) else '#33333377' for e in pkn_network.es]
plot2 = igraph.plot(pkn_network, layout = layout, vertex_size = 0.5,
vertex_border_width = 0, edge_width = 0.2, edge_color = edge_color, vertex_label_size = 9)
plot2.save('Lescarbeau_et_al_2014_pkn.png')
display(Image('Lescarbeau_et_al_2014_pkn.png'))
print('Number of edges: {}'.format(pkn_network.ecount()))
print('Number of nodes: {}'.format(pkn_network.vcount()))
Note however, that some of these edges with references pointing them to being inhibitory interactions may also have other references pointing to the contrary.
print('Interactions that have support for both, inhibition and stimulation:')
for i_edge in pkn_network.es:
if (i_edge['dirs'] is not None) and i_edge['dirs'].is_inhibition() and i_edge['dirs'].is_stimulation():
print('\t{} --> {}'.format(pkn_network.vs[i_edge.source]['label'], pkn_network.vs[i_edge.target]['label']))
We can print the network in sif format. This format describes a tab delimited text file with a line for each directed interaction and three elements per line:
with open('Lescarbeu_et_al_2014_pkn.txt', 'wt') as f:
for i_edge in pkn_network.es:
label_source = pkn_network.vs[i_edge.source]['label']
label_target = pkn_network.vs[i_edge.target]['label']
if (i_edge['dirs'] is not None) and i_edge['dirs'].is_inhibition():
f.write('{}\t{}\t{}\n'.format(label_source, -1, label_target))
# check also if the edge is reported as activation
if i_edge['dirs'].is_stimulation():
f.write('{}\t{}\t{}\n'.format(label_source, 1, label_target))
else:
f.write('{}\t{}\t{}\n'.format(label_source, 1, label_target))
We can also count the number of references that support each interaction.
# count the number of references for each edge and store it in a new edge attribute
pkn_network.es['nrefs'] = [len(edge['references']) if edge['references'] is not None else 0 for edge in pkn_network.es ]
One of the possible uses of a prior knowledge network is the simulation of a signaling network using boolean modeling techniques. We can use CellNOpt (http://www.cellnopt.org) for this task. This software will allow us to find a network that best explains our data starting from a prior knowledge network. In the final model, only necessary edges will be included. However, we may need to start with a more refined initial network. Specifically, it is recommended that the indegree of the nodes is limited to help in the optimization process (this limit may vary depending on the application). We can always come back later and include alternative edges if the model we obtain does not fit the data appropriately.
# Add a new attribute to each vertex with its indegree
pkn_network.vs['indegree'] = [i for i in pkn_network.indegree()]
indegree_hist = sns.plt.hist(pkn_network.indegree(), 100)
sns.plt.title('Indegree distribution')
In this case, we would like to limit the maximum indegree to 5. For achieving this, we need to discard some edges for those nodes with an indegree higher than 5. To choose the edges to be discarded, one option is to check the number of references supporting each edge and discard those with the lowest support.
max_indegree = 5
ids_edges_to_remove = []
for i_vertex in pkn_network.vs:
if i_vertex['indegree']>max_indegree:
edges_in = pkn_network.es.select(_target=i_vertex.index)
print(i_vertex['label'])
print("nrefs: " + ", ".join([str(i) for i in edges_in['nrefs']]))
#print("degree of source: " + ", ".join([str(pkn_network.vs[i.source].degree()) for i in edges_in]))
# get the threshold of nrefs that leaves max_indegree edges or less
nrefs_list = np.array([e['nrefs'] for e in edges_in])
nrefs_list.sort()
nrefs_threshold = nrefs_list[-(max_indegree+1)]
# get indices of edges to be removed
ids_edges_to_remove.extend([e.index for e in edges_in.select(nrefs_le=nrefs_threshold)])
print("Suggested deletions: ")
for i_edge_to_remove in edges_in.select(nrefs_le=nrefs_threshold):
i_source_label = pkn_network.vs[i_edge_to_remove.source]['label']
i_target_label = pkn_network.vs[i_edge_to_remove.target]['label']
print('\t' + ' --> '.join([i_source_label, i_target_label]) + "\t({} refs)".format(i_edge_to_remove["nrefs"]))
print('---')
We should take into account that removing edges may have undesired consequences, such as disconnecting a node from the rest of the network. If this happens, we might decide not to delete some of these edges.
pkn_network_v2 = pkn_network.copy()
pkn_network_v2.delete_edges(ids_edges_to_remove)
# extract connected components using igraph's clusters() function
# here mode='weak' instead of the default value 'strong'
pkn_network_v2_clusters = pkn_network_v2.clusters(mode='weak')
n_pkn_network_v2_clusters = len(pkn_network_v2_clusters)
print('Number of connected components: {}'.format(n_pkn_network_v2_clusters))
for i in xrange(n_pkn_network_v2_clusters):
print('\tComponent {} size: {}'.format(i, len(pkn_network_v2_clusters[i])))
In this case, we have no new disconnected nodes.
layout = pkn_network.layout_fruchterman_reingold(repulserad = pkn_network.vcount() ** 2.8,
maxiter = 1000, area = pkn_network.vcount() ** 2.3)
edge_color = ['#FF333377' if ((e['dirs'] is not None) and e['dirs'].is_inhibition()) else '#33333377' for e in pkn_network.es]
plot2 = igraph.plot(pkn_network, layout = layout, vertex_size = 0.5,
vertex_border_width = 0, edge_width = 0.2, edge_color = edge_color, vertex_label_size = 9)
plot2.save('Lescarbeau_et_al_2014_pkn_v2.png')
display(Image('Lescarbeau_et_al_2014_pkn_v2.png'))
print('Number of edges: {}'.format(pkn_network_v2.ecount()))
print('Number of nodes: {}'.format(pkn_network_v2.vcount()))
with open('Lescarbeau_et_al_2014_pkn_v2.txt', 'wt') as f:
for i_edge in pkn_network.es:
label_source = pkn_network.vs[i_edge.source]['label']
label_target = pkn_network.vs[i_edge.target]['label']
if (i_edge['dirs'] is not None) and i_edge['dirs'].is_inhibition():
f.write('{}\t{}\t{}\n'.format(label_source, -1, label_target))
# check also if the edge is reported as activation
if i_edge['dirs'].is_stimulation():
f.write('{}\t{}\t{}\n'.format(label_source, 1, label_target))
else:
f.write('{}\t{}\t{}\n'.format(label_source, 1, label_target))
Lastly, we will add manually some links to the sif file related to the treatments that appear in Figure 1.
with open('Lescarbeau_et_al_2014_pkn_v2.txt', 'at') as f:
f.write('{}\t{}\t{}\n'.format("DHT", 1, "AR"))
f.write('{}\t{}\t{}\n'.format("DHT", 1, "AR"))
f.write('{}\t{}\t{}\n'.format("Docetaxel", 1, "TUBB"))
f.write('{}\t{}\t{}\n'.format("Docetaxel", 1, "TUBB"))
f.write('{}\t{}\t{}\n'.format("Stress", 1, "RAC1"))
f.write('{}\t{}\t{}\n'.format("Stress", 1, "RAC1"))