This notebook shows how to locate Transcription Factors (TFs) in pypath.
# Show all the plots inside the notebook
%matplotlib inline
# load packages
import pypath
import igraph # import igraph to use the plot function
import numpy as np
import pandas as pd
import seaborn as sns
# Create the pypath network object; every later cell works on this `pa` instance.
pa = pypath.PyPath()
# Populate the network.
# NOTE(review): presumably loads pypath's default resources and can be slow on
# the first run -- confirm against the pypath documentation.
pa.init_network()
We will use GO annotations to locate TFs.
# Load the GO annotations; the cells below read GO data through `pa.go` and
# through the 'go' vertex attribute.
pa.load_go()
# Get the GO annotation.
# NOTE(review): presumably this is the call that attaches the per-vertex 'go'
# attribute queried later -- TODO confirm.
pa.go_dict()
# Also build the directed network; the cells below query it as `pa.dgraph`.
pa.get_directed()
# Disabled alternative: keep the undirected graph as `pa.ugraph` and make the
# directed graph the default `pa.graph`.
#pa.ugraph = pa.graph
#pa.graph = pa.dgraph
# List the cellular-component ('C') GO annotations of GATA1 by name instead of by ID.
# (9606 is the NCBI taxonomy ID of human.)
# A list comprehension replaces the bare `map(...)`: on Python 3 `map` returns a
# lazy iterator, so the cell would display `<map object ...>` rather than the names.
[pa.go[9606].get_name(term) for term in set(pa.gs('GATA1')['go']['C'])]
# Resolve both GO terms once, up front. The lookups do not depend on the vertex,
# so there is no reason to repeat them for every node of the directed graph.
tf_term = pa.go[9606].get_term('transcription factor complex')
tfr_term = pa.go[9606].get_term('transcriptional repressor complex')

# Select the vertices whose cellular-component ('C') annotations contain each term.
tf = pa.dgraph.vs.select(lambda vertex: tf_term in vertex['go']['C'])
tfr = pa.dgraph.vs.select(lambda vertex: tfr_term in vertex['go']['C'])

print('Number of nodes annotated as \'transcription factor complex\': {}'.format(len(tf)))
print('Number of nodes annotated as \'transcriptional repressor complex\': {}'.format(len(tfr)))
# Note: some nodes may be annotated with both GO terms, hence the set() below.
print('Number of nodes annotated with any of the two terms above: {}'.format(len(set(tf['label']+tfr['label']))))
We can also look for nodes annotated with several GO terms at once. For example, we can try to locate all the nodes corresponding to plasma membrane proteins located on the cell surface.
# Resolve the two GO terms once instead of twice per vertex inside the filter.
cell_surface_term = pa.go[9606].get_term('cell surface')
plasma_membrane_term = pa.go[9606].get_term('plasma membrane')


def filter_func(vertex):
    """Return True when `vertex` is annotated with both GO terms.

    Both 'cell surface' and 'plasma membrane' must be present in the vertex's
    cellular-component ('C') GO annotations.
    """
    cellular_component = vertex['go']['C']
    return (cell_surface_term in cellular_component
            and plasma_membrane_term in cellular_component)


pm = pa.dgraph.vs.select(filter_func)
print('Number of nodes annotated with \'cell surface\' and \'plasma membrane\': {}'.format(len(pm)))
Locate nodes with no inputs or no outputs. Also, check that there are no isolated nodes.
directed_nodes = pa.dgraph.vs

# Nodes of the directed graph with no outgoing arcs.
only_in = directed_nodes.select(lambda node: node.outdegree() == 0)
# Nodes of the directed graph with no incoming arcs.
only_out = directed_nodes.select(lambda node: node.indegree() == 0)
# Nodes of `pa.graph` with no arcs at all.
isolated = pa.graph.vs.select(lambda node: node.degree() == 0)

# Report the three counts in the same order as they were computed.
for template, nodes in (
    ('Number of nodes with no output arcs: {}', only_in),
    ('Number of nodes with no input arcs: {}', only_out),
    ('Number of nodes with no arcs: {}', isolated),
):
    print(template.format(len(nodes)))
# Rows: labels of the nodes selected by the 'cell surface' + 'plasma membrane' filter.
rows = pm['label']

# Columns: labels of the nodes selected by either transcription-factor GO term.
# A node annotated with both terms appears in `tf` and in `tfr`; keep only its
# first occurrence so the distance matrix has no duplicated columns (which
# would double-count those distances in the plots below).
cols = []
seen_labels = set()
for label in tf['label'] + tfr['label']:
    if label not in seen_labels:
        seen_labels.add(label)
        cols.append(label)

# Resolve the target vertex names once rather than once per (source, target) pair.
target_names = [pa.dgenesymbol(label)['name'] for label in cols]

# Vertex ids lying on any of the shortest paths found.
dnode_list = set()
# distances[i, j] = number of arcs on the shortest path rows[i] -> cols[j];
# NaN where no path exists.
distances = np.full((len(rows), len(cols)), np.nan)

for row_index, igene1 in enumerate(rows):
    source_name = pa.dgenesymbol(igene1)['name']
    # One call per source returns one path per target, in target order.
    # Guard the empty case explicitly: `to=None` would mean "all vertices".
    paths = (
        pa.dgraph.get_shortest_paths(source_name, to=target_names)
        if target_names else []
    )
    for col_index, path in enumerate(paths):
        dnode_list.update(path)
        if path:  # an empty path means the target is unreachable
            distances[row_index, col_index] = len(path) - 1

ddistance = pd.DataFrame(distances, index=rows, columns=cols)

# Subgraph induced by every vertex that lies on at least one shortest path.
interconnection_dgraph = pa.dgraph.induced_subgraph(dnode_list)
# for directed graphs with many edges, plotting the network may be prohibitive
#igraph.plot(interconnection_dgraph, layout=interconnection_dgraph.layout_auto(), vertex_label=None)

# NOTE(review): `sns.plt` is only available in older seaborn releases -- confirm
# the seaborn version this notebook is pinned to.

# Degree distribution of the interconnection subgraph.
sns.plt.hist(interconnection_dgraph.degree(), bins=100)

# Flatten the distance matrix once and reuse it for both plots. `.values`
# replaces `DataFrame.as_matrix()`, which is deprecated and absent from recent
# pandas releases.
flat_distances = ddistance.values.ravel()

# Every pairwise distance, in matrix order.
sns.plt.plot(flat_distances, '.')

# Distribution of the finite distances; NaN marks pairs with no path.
sns.plt.hist(flat_distances[~np.isnan(flat_distances)], bins=100)
# Let pypath flag transcription factors.
# NOTE(review): presumably this is what populates the 'tf' vertex attribute
# read by the selection below -- confirm.
pa.set_transcription_factors()
# NOTE(review): this value is overwritten by the very next statement, so the
# assignment is dead unless `pa.transcription_factors()` is called for a side
# effect -- verify and drop if not needed.
pa_tf = pa.transcription_factors()
# Keep the vertices of `pa.graph` whose 'tf' attribute is exactly True.
pa_tf = pa.graph.vs.select(lambda vertex: vertex['tf'] is True)
# Number of vertices flagged as transcription factors.
len(pa_tf)
# Let pypath flag receptors.
# NOTE(review): presumably this is what populates the 'rec' vertex attribute
# read by the selection below -- confirm.
pa.set_receptors()
# Keep the vertices of `pa.graph` whose 'rec' attribute is exactly True.
pa_rec = pa.graph.vs.select(lambda vertex: vertex['rec'] is True)
# Number of vertices flagged as receptors.
len(pa_rec)
# URL registered in pypath's data formats under the 'hpmr' key.
# NOTE(review): presumably the source of the receptor list -- confirm.
pypath.data_formats.urls['hpmr']['url']
# Transcription factors according to the two GO terms (may contain repeated
# labels for nodes annotated with both terms; the set operation below absorbs them).
tf_by_go = list(tf['label'] + tfr['label'])
# Transcription factors according to pypath's own 'tf' flag.
tf_by_pa = list(pa_tf['label'])
# Labels found by both approaches.
tf_shared = set(tf_by_pa).intersection(tf_by_go)
# Was `len(tmp3)`: `tmp3` is never defined in this notebook, so the cell raised
# NameError on a fresh kernel. The size of the overlap is what is meant here.
len(tf_shared)
# Labels selected by the GO 'cell surface' + 'plasma membrane' filter.
rec_by_go = list(pm['label'])
# Labels of the vertices pypath itself flags as receptors.
rec_by_pa = list(pa_rec['label'])
# Overlap between the two selections.
rec_shared = set(rec_by_go) & set(rec_by_pa)
len(rec_shared)