# Mk multi regime models
from __future__ import absolute_import, division, print_function, unicode_literals
import ivy
import math
import random
import itertools
import types
from math import ceil
import line_profiler
import numpy as np
import scipy
import pymc
from scipy import special
from scipy.optimize import minimize
from scipy.special import binom
from ivy.chars.expokit import cyexpokit
try:
StringTypes = types.StringTypes # Python 2
except AttributeError: # Python 3
StringTypes = [str]
IDEG = 6 # Constant used in Cython/Fortran code.
[docs]def mk_mr_midbranch(tree, chars, Qs, switchpoint, pi="Equal", returnPi=False,
ar = None, debug=False):
"""
Calculate likelhiood of mk model with BAMM-like multiple regimes
Args:
tree (Node): Root node of a tree. All branch lengths must be
greater than 0 (except root)
chars (dict): Dict mapping character states to tip labels.
Character states should be coded 0,1,2...
Can also be a list with tip states in preorder sequence
Qs (np.array): Array of instantaneous rate matrices
locs (np.array): Array of the same length as Qs containing the
node indices that correspond to each Q matrix
p (np.array): Optional pre-allocated p matrix
pi (str or np.array): Option to weight the root node by given values.
Either a string containing the method or an array
of weights. Weights should be given in order.
Accepted methods of weighting root:
Equal: flat prior
Equilibrium: Prior equal to stationary distribution
of Q matrix
Fitzjohn: Root states weighted by how well they
explain the data at the tips.
Returns:
(float): Log-likelihood of tree/characters given the Q matrices.
"""
####################
# Step 1: setting up
####################
# These lines of code ensure that the object switchpoint is of the correct
# type even when passed in from PYMC
if len(switchpoint)>0:
if type(switchpoint[0]) == pymc.PyMCObjects.Stochastic:
switchpoint = [i.value for i in switchpoint]
# chars can be passed in as a dict. If it is, convert it to a list of ints in preorder sequence.
if type(chars) == dict:
chars = [chars[l] for l in [n.label for n in tree.leaves()]]
# Number of character states
nchar = len(set(chars))
# Creation of preallocated arrays.
if ar is None:
ar = create_mkmr_mb_ar(tree,chars,nregime=Qs.shape[0],findmin=True)
# chars is going to need to be reordered.
chars = ar["chars"]
# Get the actual nodes from the switchpoint tuple
switchpoint_nodes = [ar["tree_copy"][switchpoint[i][0].id] for i in range(len(switchpoint))]
# Reset t values
ar["t"] = ar["blens"][:] + 1e-40 # Branch lengths can't be exactly zero.
# Adjust t to represent the length of each switchpoint
for s in switchpoint:
sw = ar["tree_copy"][s[0].id].pi #Switchpoint
sk = ar["tree_copy"][s[0].id].parent.pi #Switchpoint's parent
ar["t"][sw] = s[1]
ar["t"][sk] = ar["tree_copy"][s[0].id].parent.length - s[1]
# inds corresponds to the tree in postorder sequence, with each value
# corresponding to the regime that node is in. It is passed to dexpm3
switches_ni = [ar["tree_copy"][s[0].id].ni for s in switchpoint] + [0]
cyexpokit.inds_from_switchpoint_p(np.array(switches_ni), ar["cladesize_preorder"],
ar["clades_postorder_preorder"],
ar["inds"])
(Qs != ar["prev_Q"]).any(axis=(1,2), out=ar["Qdif"])
#####################
# Step 2: create "pmask"
#####################
# pmask is a mask array that keeps track of which p-matrices need to be
# recalculated and which ones can use the values from the previous call.
#Which Qs are different? Different Qs mean we have to recalculate p for that index
np.logical_or.reduce((ar["prev_inds"]!=ar["inds"], ar["t"]!=ar["prev_t"],ar["Qdif"][ar["inds"]]),out=ar["pmask"]) # Which p-matrices do we need to recalcualte?
##################
# Step 3: calculate p matrices
##################
# Creating probability matrices from Q matrices and branch lengths
# inds indicates which Q matrix to use for which branch
# np.exp(ar["p"], out=ar["p"]) # Exponentiate p matrices
Qs += 1e-40
cyexpokit.lndexpm3(Qs,ar["t"],np.array(ar["inds"]),ar["p"],ideg=IDEG,wsp=ar["wsp"],pmask=ar["pmask"].astype(int)) # Calculating the actual p matrix
# np.log(ar["p"], out=ar["p"]) # Log p matrices
#################
# Step 4: calculate likelihood
#################
# Calculating the likelihoods for each node in post-order sequence
np.copyto(ar["nodelist"], ar["nodelistOrig"]) # Resetting the starting values of nodelist.
cyexpokit.cy_mk_log(ar["nodelist"], ar["p"], nchar, ar["tmp_ar"],ar["intnode_list"],ar["child_ar"]) # Performing the likelihood calculation
# cyexpokit.cy_mk_log(ar["nodelist"], ar["p"], nchar, ar["tmp_ar"],ar["intnode_list"],ar["child_ar"])
################
# Step 5: apply root prior
################
# The last row of nodelist contains the likelihood values at the root
# Applying the correct root prior
if not type(pi) in StringTypes:
assert len(pi) == nchar, "length of given pi does not match Q dimensions"
assert str(type(pi)) == "<type 'numpy.ndarray'>", "pi must be str or numpy array"
assert np.isclose(sum(pi), 1), "values of given pi must sum to 1"
np.copyto(ar["root_priors"], pi)
rootliks = ([ i+np.log(ar["root_priors"][n]) for n,i in enumerate(ar["nodelist"][-1,:-1]) ])
elif pi == "Equal":
ar["root_priors"].fill(1.0/nchar)
rootliks = [ i+np.log(ar["root_priors"][n]) for n,i in enumerate(ar["nodelist"][-1,:-1])]
elif pi == "Fitzjohn":
np.copyto(ar["root_priors"],
[ar["nodelist"][-1,:-1][charstate]-
scipy.misc.logsumexp(ar["nodelist"][-1,:-1]) for
charstate in set(chars) ])
rootliks = [ ar["nodelist"][-1,:-1][charstate] +
ar["root_priors"][charstate] for charstate in set(chars) ]
elif pi == "Equilibrium":
# Equilibrium pi from the stationary distribution of Q
np.copyto(ar["root_priors"],qsd(Q))
rootliks = [ i + np.log(ar["root_priors"][n]) for n,i in enumerate(ar["nodelist"][-1,:-1]) ]
logli = scipy.misc.logsumexp(rootliks)
###############
# Cleanup and storing values for next call to this function
##############
ar["prev_t"][:] = ar["t"][:]
ar["prev_inds"][:] = ar["inds"][:]
ar["prev_Q"][:] = Qs[:]
if returnPi:
return (logli, {k:v for k,v in enumerate(ar["root_priors"])})
else:
return logli
[docs]def create_mkmr_ar(tree, chars,nregime,findmin = True):
"""
Create preallocated arrays. For use in mk function
Nodelist = edgelist of nodes in postorder sequence
"""
if type(chars) == dict:
chars = [chars[l] for l in [n.label for n in tree.leaves()]]
t = np.array([node.length for node in tree.postiter() if not node.isroot], dtype=np.double) # t is in POSTORDER SEQUENCE
nt = len(tree.descendants())
nchar = len(set(chars))
preleaves = [ n for n in tree.preiter() if n.isleaf ]
postleaves = [n for n in tree.postiter() if n.isleaf ]
postnodes = list(tree.postiter())
postChars = [ chars[i] for i in [ preleaves.index(n) for n in postleaves ] ]
nnode = len(t)+1
nodelist = np.zeros((nnode, nchar+1))
nodelist.fill(-np.inf) # the log of 0 is negative infinity
leafind = [ n.isleaf for n in tree.postiter()]
for k,ch in enumerate(postChars):
[ n for i,n in enumerate(nodelist) if leafind[i] ][k][ch] = np.log(1.0)
for i,n in enumerate(nodelist[:-1]):
n[nchar] = postnodes.index(postnodes[i].parent)
# Setting initial node likelihoods to log one for calculations
nodelist[[ i for i,b in enumerate(leafind) if not b],:-1] = np.log(1.0)
# Empty Q matrix
Q = np.zeros([nregime,nchar, nchar], dtype=np.double)
# Empty p matrix
p = np.empty([nt, nchar, nchar], dtype = np.double, order="C")
nodelistOrig = nodelist.copy()
rootpriors = np.empty([nchar], dtype=np.double)
if findmin:
nullval = np.inf
else:
nullval = -np.inf
treelen = sum([ n.length for n in tree.leaves()[0].rootpath() if n.length]+[
tree.leaves()[0].length])
upperbound = len(tree.leaves())/treelen
charlist = list(range(nchar))
tmp_ar = np.zeros(nchar) # Used for storing calculations
wsp = np.empty(4*nchar*nchar+IDEG+1)
var = {"Q": Q, "p": p, "t":t, "nodelist":nodelist, "charlist":charlist,
"nodelistOrig":nodelistOrig, "upperbound":upperbound,
"root_priors":rootpriors, "nullval":nullval, "tmp_ar":tmp_ar}
return var
[docs]def create_mkmr_mb_ar(tree, chars,nregime,findmin = True):
"""
Preallocated arrays for midbranch mk_mr
Create preallocated arrays. For use in mk_mr midbranch function
Nodelist = edgelist of nodes in postorder sequence
"""
if type(chars) != dict:
chars = {tree.leaves()[i].label:v for i,v in enumerate(chars)}
tree_copy_ = tree.copy()
for n in tree_copy_.descendants():
n.meta["cached"]=False
n.bisect_branch(1e-15, reindex=False)
# Here we break up the tree so that each node has a "knee"
# for a parent. This knee starts with a length of 0, effectively
# making it nonexistant, but can have its length changed
# to act as a switchpoint.
tree_copy_str = tree_copy_.write()
tree_copy = ivy.tree.read(tree_copy_str)
for n1,n2 in zip(tree_copy_.preiter(), tree_copy.preiter()):
n2.id = n1.id
for n in tree_copy:
n.cladesize = len(n)
chars = [chars[l] for l in [n.label for n in tree_copy.leaves()]]
blens = np.array([node.length for node in tree_copy.postiter() if not node.isroot])
t = np.array(blens, dtype=np.double)
prev_t = t.copy()
prev_t.fill(np.inf)
nt = len(t)
nchar = len(set(chars))
preleaves = [ n for n in tree_copy.preiter() if n.isleaf ]
postleaves = [n for n in tree_copy.postiter() if n.isleaf ]
postnodes = list(tree_copy.postiter())
edgelist = [-np.inf] * len(tree_copy)
for i in range(len(edgelist)-1):
edgelist[i] = postnodes[i].parent.pi
postChars = [ chars[i] for i in [ preleaves.index(n) for n in postleaves ] ]
nnode = len(t)+1
nodelist = np.zeros((nnode, nchar+1))
nodelist.fill(-np.inf) # the log of 0 is negative infinity
leafind = [ n.isleaf for n in tree_copy.postiter()]
for k,ch in enumerate(postChars):
[ n for i,n in enumerate(nodelist) if leafind[i] ][k][ch] = np.log(1.0)
nodelist[:,-1] = edgelist
# Setting initial node likelihoods to log one for calculations
nodelist[[ i for i,b in enumerate(leafind) if not b],:-1] = np.log(1.0)
# Empty Q matrix
Q = np.zeros([nregime,nchar, nchar], dtype=np.double)
prev_Q = Q.copy()
prev_Q.fill(np.inf)
# Empty p matrix
p = np.empty([nt, nchar, nchar], dtype = np.double, order="C")
pmask = np.array([True]*len(t))
nodelistOrig = nodelist.copy()
rootpriors = np.empty([nchar], dtype=np.double)
if findmin:
nullval = np.inf
else:
nullval = -np.inf
treelen = sum([ n.length for n in tree_copy.leaves()[0].rootpath() if n.length]+[
tree_copy.leaves()[0].length])
upperbound = len(tree_copy.leaves())/treelen
charlist = list(range(nchar))
tmp_ar = np.zeros(nchar) # Used for storing calculations
pre_post = np.array([n.pi for n in tree_copy]) # The postorder index that corresponds to each preorder index
prev_inds = np.zeros(len(t), dtype=int) #Keeping track of locations from previous call
Qdif = np.ones([nregime], dtype=bool) # Array for keeping track of changes to Q
locs = np.zeros(nregime, dtype=object)
inds = np.zeros(nt, dtype=int)
max_children = max(len(n.children) for n in tree_copy)
child_ar = np.empty([tree_copy.cladesize,max_children], dtype=np.int64)
child_ar.fill(-1)
intnode_list = np.array(sorted(set(nodelist[:-1,nchar])), dtype=int)
for intnode in intnode_list:
children = np.where(nodelist[:,nchar]==intnode)[0]
child_ar[int(intnode)][:len(children)] = children
wsp = np.empty(4*nchar*nchar+IDEG+1)
cladesize_preorder = np.array([len(n) for n in tree_copy])
clades_postorder_preorder = np.zeros([len(tree_copy),len(tree_copy)])
clades_postorder_preorder -= 1
for ni,node in enumerate(tree_copy):
clades_postorder_preorder[ni][:cladesize_preorder[ni]] = [x.pi for x in node]
var = {"Q": Q, "p": p, "t":t, "nodelist":nodelist, "charlist":charlist,
"nodelistOrig":nodelistOrig, "upperbound":upperbound,
"root_priors":rootpriors, "nullval":nullval, "tmp_ar":tmp_ar,
"tree_copy":tree_copy,"chars":chars,"pre_post":pre_post,"prev_Q":prev_Q,
"prev_t":prev_t, "blens":blens,"pmask":pmask,
"prev_inds":prev_inds, "locs":locs,"Qdif":Qdif,
"intnode_list":intnode_list,"child_ar":child_ar,"wsp":wsp,
"inds":inds,"cladesize_preorder":cladesize_preorder,
"clades_postorder_preorder":clades_postorder_preorder}
return var
[docs]def create_likelihood_function_multimk(tree, chars, Qtype, nregime, pi="Equal",
findmin = True):
if type(chars) == dict:
chars = [chars[l] for l in [n.label for n in tree.leaves()]]
if findmin:
nullval = np.inf
else:
nullval = -np.inf
nchar = len(set(chars))
nt = len(tree.descendants())
charlist = list(range(nchar))
# Number of parameters per Q matrix
n_qp = nchar**2-nchar
# Empty Q matrix
Q = np.zeros([nregime,nchar,nchar], dtype=np.double)
# Empty p matrix
p = np.empty([nt, nchar, nchar], dtype = np.double, order="C")
# Empty likelihood array
var = create_mkmr_ar(tree, chars,nregime,findmin)
def likelihood_function(Qparams, locs):
# Enforcing upper bound on parameters
Qparams = [float(qp) for qp in Qparams]
# # TODO: replace with sum of each Q
if (np.sum(Qparams)/len(locs) > var["upperbound"]) or any([x<=0 for x in Qparams]):
return var["nullval"]
# if not (Qparams == sorted(Qparams)):
# return var["nullval"]
# Filling Q matrices:
Qparams = [Qparams[i:i+n_qp] for i in range(nregime)]
if Qtype == "ER":
for i,qmat in enumerate(var["Q"]):
qmat.fill(float(Qparams[i]))
qmat[np.diag_indices(nchar)] = -Qparams[i] * (nchar-1)
elif Qtype == "ARD":
for i,qmat in enumerate(var["Q"]):
qmat.fill(0.0) # Re-filling with zeroes
qmat[np.triu_indices(nchar, k=1)] = Qparams[i][:len(Qparams[i])/2]
qmat[np.tril_indices(nchar, k=-1)] = Qparams[i][len(Qparams[i])/2:]
qmat[np.diag_indices(nchar)] = 0-np.sum(qmat, 1)
else:
raise ValueError("Qtype must be one of: ER, Sym, ARD")
# Resetting the values in these arrays
np.copyto(var["nodelist"], var["nodelistOrig"])
var["root_priors"].fill(1.0)
if findmin:
x = -1
else:
x = 1
try:
return x * mk_mr(tree, chars, var["Q"], locs, pi = pi, ar=var) # Minimizing negative log-likelihood
except ValueError: # If likelihood returned is 0
return var["nullval"]
return likelihood_function
[docs]def lf_mk_mr_midbranch(tree, chars, Qtype, nregime, pi="Equal",
findmin = True):
if type(chars) == dict:
chardict = chars
chars = [chars[l] for l in [n.label for n in tree.leaves()]]
else:
chardict = {tree.leaves()[i].label:v for i,v in enumerate(chars)}
nchar = len(set(chars))
# Empty likelihood array
var = create_mkmr_mb_ar(tree, chardict,nregime,findmin)
# Number of parameters per Q matrix
n_qp = nchar**2-nchar
def likelihood_function(Qparams, switchpoint):
# Enforcing upper bound on parameters
Qparams = [float(qp) for qp in Qparams]
# # TODO: replace with sum of each Q
if (np.sum(Qparams)/nregime > var["upperbound"]) or any([x<=0 for x in Qparams]):
return var["nullval"]
# if not (Qparams == sorted(Qparams)):
# return var["nullval"]
# Filling Q matrices:
Qparams = [Qparams[i*n_qp:i*n_qp+n_qp] for i in range(nregime)]
if Qtype == "ER":
for i,qmat in enumerate(var["Q"]):
qmat.fill(float(Qparams[i]))
qmat[np.diag_indices(nchar)] = -Qparams[i] * (nchar-1)
elif Qtype == "ARD":
for i,qmat in enumerate(var["Q"]):
qmat.fill(0.0) # Re-filling with zeroes
qmat[np.triu_indices(nchar, k=1)] = Qparams[i][:len(Qparams[i])//2]
qmat[np.tril_indices(nchar, k=-1)] = Qparams[i][len(Qparams[i])//2:]
qmat[np.diag_indices(nchar)] = 0-np.sum(qmat, 1)
else:
raise ValueError("Qtype must be one of: ER, Sym, ARD")
# Resetting the values in these arrays
np.copyto(var["nodelist"], var["nodelistOrig"])
var["root_priors"].fill(1.0)
if findmin:
x = -1
else:
x = 1
try:
return x * mk_mr_midbranch(tree, chars, var["Q"], switchpoint, pi = pi, ar=var) # Minimizing negative log-likelihood
except ValueError: # If likelihood returned is 0
return var["nullval"]
return likelihood_function
[docs]def create_likelihood_function_multimk_mods(tree, chars, mods, pi="Equal",
findmin = True, orderedparams=True):
"""
Create a likelihood function for testing the parameter values of user-
specified models
"""
if type(chars) == dict:
chars = [chars[l] for l in [n.label for n in tree.leaves()]]
if findmin:
nullval = np.inf
else:
nullval = -np.inf
nregime = len(mods)
nchar = len(set(chars))
nt = len(tree.descendants())
charlist = list(range(nchar))
nparam = len(set([i for s in mods for i in s]))
# Empty likelihood array
var = create_mkmr_ar(tree, chars,nregime,findmin)
def likelihood_function(Qparams, locs):
# Enforcing upper bound on parameters
Qparams = np.insert(Qparams, 0, 1e-15)
# # TODO: replace with sum of each Q
if (np.sum(Qparams)/len(locs) > var["upperbound"]) or any([x<=0 for x in Qparams]):
return var["nullval"]
if orderedparams:
if any(Qparams != sorted(Qparams)):
return var["nullval"]
# Clearing Q matrices
var["Q"].fill(0.0)
# Filling Q matrices:
fill_model_mr_Q(Qparams, mods, var["Q"])
# Resetting the values in these arrays
np.copyto(var["nodelist"], var["nodelistOrig"])
var["root_priors"].fill(1.0)
if findmin:
x = -1
else:
x = 1
try:
return x * mk_mr(tree, chars, var["Q"], locs, pi = pi, ar=var) # Minimizing negative log-likelihood
except ValueError: # If likelihood returned is 0
return var["nullval"]
return likelihood_function
[docs]def lf_mk_mr_midbranch_mods(tree, chars, mods, pi="Equal",
findmin = True, orderedparams=True):
"""
Create a likelihood function for testing the parameter values of user-
specified models with switchpoints allowed in the middle of branches
"""
if type(chars) == dict:
chardict = chars
chars = [chars[l] for l in [n.label for n in tree.leaves()]]
else:
chardict = {tree.leaves()[i].label:v for i,v in enumerate(chars)}
nregime = len(mods)
nparam = len(set([i for s in mods for i in s]))
# Empty likelihood array
var = create_mkmr_mb_ar(tree, chardict,nregime,findmin)
def likelihood_function(Qparams, switchpoint):
# Enforcing upper bound on parameters
Qparams = np.insert(Qparams, 0, 1e-15)
# # TODO: replace with sum of each Q
if (np.sum(Qparams)/nregime > var["upperbound"]) or any([x<=0 for x in Qparams]):
return var["nullval"]
if orderedparams:
if any(Qparams != sorted(Qparams)):
return var["nullval"]
# Clearing Q matrices
var["Q"].fill(0.0)
# Filling Q matrices:
fill_model_mr_Q(Qparams, mods, var["Q"])
# Resetting the values in these arrays
np.copyto(var["nodelist"], var["nodelistOrig"])
var["root_priors"].fill(1.0)
if findmin:
x = -1
else:
x = 1
try:
return x * mk_mr_midbranch(tree, chars, var["Q"], switchpoint, pi = pi, ar=var) # Minimizing negative log-likelihood
except ValueError: # If likelihood returned is 0
return var["nullval"]
return likelihood_function
[docs]def fill_model_mr_Q(Qparams, mods, Q):
"""
Fill a Q-matrix with Qparams corresponding to the model.
Individual Q matrices are filled rowwise
Function alters Q in place
"""
# Filling Q matrix row-wise (skipping diagonals)
nregime = len(mods)
nchar=Q.shape[1]
for regime in range(nregime):
modcount = 0 # Which mod index are we on
for r,c in itertools.product(list(range(nchar)),repeat=2):
if r==c: # Skip diagonals
pass
else:
Q[regime,r,c] = Qparams[mods[regime][modcount]]
modcount+=1
np.fill_diagonal(Q[regime], -np.sum(Q[regime], axis=1)) # Filling diagonals
[docs]def locs_from_switchpoint(tree, switches, locs=None):
"""
Given a tree and a list of nodes to be switchpoints, return an
array of all node indices in one regime vs the other
Args:
tree (Node): Root node of tree
switches (list): List of internal nodes to act as switchpoints.
Switchpoint nodes are part of their own regimes.
locs (np.array): Optional pre-allocated array to store output.
Returns:
np.array: Array of indices of all nodes associated with each switchpoint, plus
all nodes "outside" the switches in the last list
"""
# Include the root
switches = switches + [tree]
# Sort switches by clade size
# We need the "cladesize" attribute to sort the clades. If it does not
# exist on the tree, create it
if not hasattr(tree, "cladesize"):
for n in tree:
n.cladesize = len(n)
switches_c = switches[:]
switches_c.sort(key=lambda x: x.cladesize)
if locs is None:
out = True
locs = np.zeros(len(switches_c), dtype=object)
else:
out = False
locs.fill(0) # Clear locs array
for i,s in enumerate(switches_c):
t_l = []
# Add descendants of node to location array if they are not
# already recorded. This approach works because the clade sizes
# were sorted beforehand
viewed = set([x for l in locs[:i] for x in l])
for n in tree[s]:
if not n.ni in viewed:
t_l.append(n.ni)
locs[i] = t_l
# Remove the root
locs[-1] = locs[-1][1:]
# Return locs in proper order (locations descended from each switch in order,
# with locations descended from the root last)
locs[:] =locs[[switches_c.index(i) for i in switches]][:]
if out:
return locs
[docs]def inds_from_switchpoint_cython(tree, switches_ni, ar,debug=False):
nr = len(switches_ni)-1
switches_ni_c = switches_ni[:]
switches_ni_c.sort(key=lambda x: ar["cladesize_preorder"][x])
switches_key = [switches_ni_c.index(i) for i in switches_ni]
for i,s in enumerate(switches_ni_c[::-1]):
for n in ar["clades_postorder_preorder"][s]:
if not n==len(ar["cladesize_preorder"])-1: # Ignore the root
ar["inds"][n] = switches_key[nr-i]
[docs]class ModelOrderMetropolis(pymc.Metropolis):
"""
Custom step method for model order
"""
def __init__(self, stochastic):
pymc.Metropolis.__init__(self, stochastic, scale=1., proposal_distribution="prior")
[docs] def propose(self):
cur_order = self.stochastic.value
indexes_to_swap = random.sample(range(len(cur_order)),2)
new_order = cur_order[:]
# Swap values
new_order[indexes_to_swap[0]], new_order[indexes_to_swap[1]] = new_order[indexes_to_swap[1]], new_order[indexes_to_swap[0]]
self.stochastic.value = new_order
[docs] def reject(self):
self.rejected += 1
self.stochastic.value = self.stochastic.last_value
[docs] def competence(self):
return 0
[docs]class SwitchpointMetropolis(pymc.Metropolis):
"""
Custom step algorithm for selecting a new switchpoint
"""
def __init__(self, stochastic, tree, seg_map, stepsize=0.05, seglen=0.02):
pymc.Metropolis.__init__(self, stochastic, scale=0,proposal_sd=1, proposal_distribution="prior")
root_to_tip_length = tree.max_tippath()
self.tree = tree
self.seg_map = seg_map
self.stepsize = stepsize * root_to_tip_length
self.seg_size = seglen * root_to_tip_length
[docs] def propose(self):
# Following BAMM, switchpoint movements can be either global or
# local, with global switch happening 1/10 times
if (random.choice(range(10))==0):
self.global_step()
else:
self.local_step()
[docs] def local_step(self):
prev_location = self.stochastic.value
new_location = cyexpokit.local_step(prev_location,self.stepsize,self.seg_size,self.adaptive_scale_factor)
self.stochastic.value = new_location
[docs] def global_step(self):
self.stochastic.value = cyexpokit.random_tree_location(self.seg_map)
[docs] def reject(self):
self.rejected += 1
self.stochastic.value = self.stochastic.last_value
[docs] def competence(self):
return 0
[docs]def round_step_size(step_size, seg_size):
"""
Round step_size to the nearest segment
"""
if (step_size%seg_size) > (seg_size/2):
return step_size + (seg_size-(step_size%seg_size)) + 1e-15
else:
return step_size - (step_size%seg_size) + 1e-15
[docs]def tree_map(tree, seglen=0.02):
"""
Make a map of the tree cut into segments of size (seglen*root_to_tip_length)
"""
root_to_tip_length = tree.max_tippath()
seg_size = seglen * root_to_tip_length
seg_map = []
seen = [tree]
for node in tree.descendants():
cur_len = node.length
nseg = int(ceil(cur_len/seg_size))
for n in range(nseg):
seg_map.append((node, n*seg_size+1e-15))
return np.array(seg_map,dtype=object)
[docs]def random_tree_location(seg_map):
"""
Select random location on tree with uniform probability
Returns:
tuple: node and float, which represents the how far along the branch to
the parent node the switch occurs on
"""
i = random.choice(range(len(seg_map)))
return seg_map[i]
[docs]def make_switchpoint_stoch(seg_map, name=str("switch")):
startingval = cyexpokit.random_tree_location(seg_map)
@pymc.stochastic(dtype=object, name=name)
def switchpoint_stoch(value = startingval):
# Flat prior on switchpoint location
return 0
return switchpoint_stoch
[docs]def make_modelorder_stoch(mods, name=str("modorder")):
startingval = mods
@pymc.stochastic(dtype=tuple, name=name)
def modelorder_stoch(value=startingval):
return 0
return modelorder_stoch
[docs]def mk_multi_bayes(tree, chars,nregime,qidx, pi="Equal" ,seglen=0.02,stepsize=0.05):
"""
Create a Bayesian multi-mk model. User specifies which regime models
to use and the Bayesian model finds the switchpoints.
Args:
tree (Node): Root node of tree.
chars (dict): Dict mapping tip labels to discrete character
states. Character states must be in the form of [0,1,2...]
regime (int): The number of distinct regimes to test. Set to
1 for an Mk model, set to greater than 1 for a multi-regime Mk model.
qidx (np.array): Index specifying the model to test
columns:
0, 1, 2 - index axes of q
3 - index of params
This scheme allows flexible specification of models. E.g.:
Symmetric mk2:
params = [0.2]; qidx = [[0,0,1,0],
[0,1,0,0]]
Asymmetric mk2:
params = [0.2,0.6]; qidx = [[0,0,1,0],
[0,1,0,1]]
NOTE:
The qidx corresponding to the first q matrix (first column 0)
is always the root regime
pi (str or np.array): Option to weight the root node by given values.
Either a string containing the method or an array
of weights. Weights should be given in order.
Accepted methods of weighting root:
Equal: flat prior
Equilibrium: Prior equal to stationary distribution
of Q matrix
Fitzjohn: Root states weighted by how well they
explain the data at the tips.
seglen (float): Size of segments to break tree into. The smaller this
value, the more "fine-grained" the analysis will be. Optional,
defaults to 2% of the root-to-tip length.
stepsize (float): Maximum size of steps for switchpoints to take.
Optional, defaults to 5% of root-to-tip length.
"""
if type(chars) == dict:
data = chars.copy()
chars = [chars[l] for l in [n.label for n in tree.leaves()]]
else:
data = dict(zip([n.label for n in tree.leaves()],chars))
# Preparations
nchar = len(set(chars))
nparam = len(set([n[-1] for n in qidx]))
# This model has 2 components: Q parameters and switchpoints
# They are combined in a custom likelihood function
###########################################################################
# Switchpoint:
###########################################################################
# Modeling the movement of the regime shift(s) is the tricky part
# Regime shifts will only be allowed to happen at a node
seg_map = tree_map(tree,seglen)
switch = [None]*(nregime-1)
for regime in range(nregime-1):
switch[regime]= make_switchpoint_stoch(seg_map, name=str("switch_{}".format(regime)))
###########################################################################
# Qparams:
###########################################################################
# Each Q parameter is an exponential
Qparams = [None] * nparam
for i in range(nparam):
Qparams[i] = pymc.Exponential(name=str("Qparam_{}".format(i)), beta=1.0, value=0.1*(i+1))
###########################################################################
# Likelihood
###########################################################################
# The likelihood function
l = cyexpokit.make_mklnl_func(tree, data,nchar,nregime,qidx)
@pymc.deterministic
def likelihood(q = Qparams, s=switch,name="likelihood"):
return l(np.array(q),np.array([x[0].ni for x in s],dtype=np.intp),np.array([x[1] for x in s]))
@pymc.potential
def multi_mklik(lnl=likelihood):
if not (np.isnan(lnl)):
return lnl
else:
return -np.inf
mod = pymc.MCMC(locals())
for s in switch:
mod.use_step_method(SwitchpointMetropolis, s, tree, seg_map,stepsize=stepsize,seglen=seglen)
return mod