Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add bias #100

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
68 changes: 55 additions & 13 deletions pastis/io/read/all_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,19 @@ def _get_lengths(lengths):
"""Load chromosome lengths from file, or reformat lengths object.
"""

if isinstance(lengths, str) and os.path.exists(lengths):
lengths = load_lengths(lengths)
elif lengths is not None and (isinstance(lengths, list) or isinstance(lengths, np.ndarray)):
if len(lengths) == 1 and isinstance(lengths[0], str) and os.path.exists(lengths[0]):
lengths = load_lengths(lengths[0])
if lengths is not None:
if (isinstance(lengths, list) or isinstance(lengths, np.ndarray)) \
and len(lengths) == 1:
lengths = lengths[0]
if isinstance(lengths, str):
if os.path.exists(lengths):
lengths = load_lengths(lengths)
else:
raise ValueError("Path to lengths does not exist.")
if isinstance(lengths, np.int64):
lengths = lengths.item()
if isinstance(lengths, int):
lengths = [lengths]
lengths = np.array(lengths).astype(int)
return lengths

Expand All @@ -24,12 +32,18 @@ def _get_chrom(chrom, lengths):
"""

lengths = _get_lengths(lengths)
if isinstance(chrom, str) and os.path.exists(chrom):
chrom = np.array(np.genfromtxt(chrom, dtype='str')).reshape(-1)
elif chrom is not None and (isinstance(chrom, list) or isinstance(chrom, np.ndarray)):
if len(chrom) == 1 and isinstance(chrom[0], str) and os.path.exists(chrom[0]):
chrom = np.array(np.genfromtxt(chrom[0], dtype='str')).reshape(-1)
chrom = np.array(chrom)
if chrom is not None:
if (isinstance(chrom, list) or isinstance(chrom, np.ndarray)) \
and len(chrom) == 1:
chrom = chrom[0]
if isinstance(chrom, str):
if os.path.exists(chrom):
chrom = np.genfromtxt(chrom, dtype='str')
else:
raise ValueError("Path to chrom does not exist.")
if isinstance(chrom, np.ndarray):
chrom = [chrom].item()
chrom = np.array(chrom).reshape(-1)
else:
chrom = np.array(['num%d' % i for i in range(1, len(lengths) + 1)])
return chrom
Expand Down Expand Up @@ -60,8 +74,33 @@ def _get_counts(counts, lengths):
return output


def _get_bias(bias):
"""Load bias from file, or reformat bias object.
"""

if bias is not None:
if (isinstance(bias, list) or isinstance(bias, np.ndarray)) \
and len(bias) == 1:
bias = bias[0]
if isinstance(bias, str):
if os.path.exists(bias):
if bias.endswith(".npy"):
bias = np.load(bias)
else:
bias = np.loadtxt(bias)
else:
raise ValueError("Path to bias vector does not exist.")
if isinstance(bias, np.int64):
bias = bias.item()
if isinstance(bias, int):
bias = [bias]
bias = np.array(bias).astype(float)
return bias


def load_data(counts, lengths_full, ploidy, chrom_full=None,
chrom_subset=None, exclude_zeros=False, struct_true=None):
chrom_subset=None, exclude_zeros=False, struct_true=None,
bias=None):
"""Load all input data from files, and/or reformat data objects.

If files are provided, load data from files. Also reformats data objects.
Expand All @@ -81,6 +120,8 @@ def load_data(counts, lengths_full, ploidy, chrom_full=None,
chrom_subset : list of str, optional
Label for each chromosome to be excised from the full data; labels of
chromosomes for which inference should be performed.
bias : str, optional
The path to the bias vector to normalize the counts with.

Returns
-------
Expand All @@ -106,6 +147,7 @@ def load_data(counts, lengths_full, ploidy, chrom_full=None,
lengths_full = _get_lengths(lengths_full)
chrom_full = _get_chrom(chrom_full, lengths_full)
counts = _get_counts(counts, lengths_full)
bias = _get_bias(bias)

if struct_true is not None and isinstance(struct_true, str):
struct_true = np.loadtxt(struct_true)
Expand All @@ -115,4 +157,4 @@ def load_data(counts, lengths_full, ploidy, chrom_full=None,
chrom_full=chrom_full, chrom_subset=chrom_subset,
exclude_zeros=exclude_zeros, struct_true=struct_true)

return counts, lengths_subset, chrom_subset, lengths_full, chrom_full, struct_true
return counts, lengths_subset, chrom_subset, lengths_full, chrom_full, struct_true, bias
18 changes: 11 additions & 7 deletions pastis/optimization/counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,8 @@ def check_counts(counts, lengths, ploidy, exclude_zeros=False,
def preprocess_counts(counts_raw, lengths, ploidy, multiscale_factor, normalize,
filter_threshold, beta=None, fullres_torm=None,
excluded_counts=None, mixture_coefs=None,
exclude_zeros=False, input_weight=None, verbose=True):
exclude_zeros=False, input_weight=None, verbose=True,
bias=None):
"""Check counts, reformat, reduce resolution, filter, and compute bias.

Preprocessing options include reducing resolution, computing bias (if
Expand Down Expand Up @@ -329,6 +330,8 @@ def preprocess_counts(counts_raw, lengths, ploidy, multiscale_factor, normalize,
therefore be removed. There should be one array per counts matrix.
excluded_counts : {"inter", "intra"}, optional
Whether to exclude inter- or intra-chromosomal counts from optimization.
bias: str, optional
The path to the bias vector to normalize the counts data with.

Returns
-------
Expand All @@ -344,7 +347,7 @@ def preprocess_counts(counts_raw, lengths, ploidy, multiscale_factor, normalize,
counts_raw, lengths=lengths, ploidy=ploidy,
multiscale_factor=multiscale_factor, normalize=normalize,
filter_threshold=filter_threshold, exclude_zeros=exclude_zeros,
verbose=verbose)
verbose=verbose, bias=bias)

lengths_lowres = decrease_lengths_res(lengths, multiscale_factor)

Expand Down Expand Up @@ -389,7 +392,7 @@ def _percent_nan_beads(counts):

def _prep_counts(counts_list, lengths, ploidy=1, multiscale_factor=1,
normalize=True, filter_threshold=0.04, exclude_zeros=True,
verbose=True):
verbose=True, bias=None):
"""Copy counts, check matrix, reduce resolution, filter, and compute bias.
"""

Expand Down Expand Up @@ -502,17 +505,19 @@ def _prep_counts(counts_list, lengths, ploidy=1, multiscale_factor=1,
individual_counts_torms = individual_counts_torms | torm
counts = sparse.coo_matrix(counts)
counts_dict[counts_type] = counts


verbose=True
# Optionally normalize counts
bias = None
if normalize:
if normalize and bias is None:
# If we have to calculate the bias
if verbose:
print('COMPUTING BIAS: all counts together', flush=True)
bias = ICE_normalization(
ambiguate_counts(
list(counts_dict.values()), lengths=lengths_lowres,
ploidy=ploidy, exclude_zeros=True),
max_iter=300, output_bias=True)[1].flatten()
if bias is not None:
# In each counts matrix, zero out counts for which bias is NaN
for counts_type, counts in counts_dict.items():
initial_zero_beads = find_beads_to_remove(
Expand All @@ -533,7 +538,6 @@ def _prep_counts(counts_list, lengths, ploidy=1, multiscale_factor=1,
print(' removing %d additional beads from %s' %
(torm.sum() - initial_zero_beads, counts_type),
flush=True)

output_counts = check_counts(
list(counts_dict.values()), lengths=lengths_lowres, ploidy=ploidy,
exclude_zeros=exclude_zeros)
Expand Down
3 changes: 2 additions & 1 deletion pastis/optimization/initialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def _initialize_struct_mds(counts, lengths, ploidy, alpha, bias, random_state,
ua_beta = ua_counts.beta
if ua_beta is not None:
ua_beta *= multiscale_factor ** 2

struct = estimate_X(
ua_counts._counts.astype(float),
alpha=-3. if alpha is None else alpha, beta=ua_beta, verbose=False,
Expand All @@ -42,6 +42,7 @@ def _initialize_struct_mds(counts, lengths, ploidy, alpha, bias, random_state,

struct = struct.reshape(-1, 3)
torm = find_beads_to_remove(counts, struct.shape[0])

struct[torm] = np.nan

return struct
Expand Down
3 changes: 2 additions & 1 deletion pastis/optimization/mds.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,9 @@ def estimate_X(counts, alpha=-3., beta=1., ini=None,
The structure as an ndarray of shape (n, 3).

"""

n = counts.shape[0]

random_state = check_random_state(random_state)
if ini is None or ini == "random":
ini = 1 - 2 * random_state.rand(n * 3)
Expand Down
33 changes: 20 additions & 13 deletions pastis/optimization/pastis_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def _infer_draft(counts_raw, lengths, ploidy, outdir=None, alpha=None, seed=0,
struct_draft_fullres=None, callback_freq=None,
callback_function=None, reorienter=None, alpha_true=None,
struct_true=None, input_weight=None, exclude_zeros=False,
null=False, mixture_coefs=None, verbose=True):
null=False, mixture_coefs=None, verbose=True, bias=None):
"""Infer draft 3D structures with PASTIS via Poisson model.
"""

Expand Down Expand Up @@ -92,7 +92,8 @@ def _infer_draft(counts_raw, lengths, ploidy, outdir=None, alpha=None, seed=0,
counts_raw=counts_raw, lengths=lengths, ploidy=ploidy,
normalize=normalize, filter_threshold=filter_threshold,
multiscale_factor=1, exclude_zeros=exclude_zeros, beta=beta,
input_weight=input_weight, verbose=False, mixture_coefs=mixture_coefs)
input_weight=input_weight, verbose=False, mixture_coefs=mixture_coefs,
bias=bias)
beta = [c.beta for c in counts if c.sum() != 0]

alpha_ = alpha
Expand All @@ -117,7 +118,7 @@ def _infer_draft(counts_raw, lengths, ploidy, outdir=None, alpha=None, seed=0,
reorienter=reorienter, alpha_true=alpha_true,
struct_true=struct_true, input_weight=input_weight,
exclude_zeros=exclude_zeros, null=null, mixture_coefs=mixture_coefs,
verbose=verbose)
verbose=verbose, bias=bias)
if not infer_var_fullres['converged']:
return struct_draft_fullres, alpha_, beta_, hsc_r, False
if alpha is not None:
Expand Down Expand Up @@ -176,7 +177,7 @@ def _infer_draft(counts_raw, lengths, ploidy, outdir=None, alpha=None, seed=0,
reorienter=reorienter, alpha_true=alpha_true,
struct_true=struct_true, input_weight=input_weight,
exclude_zeros=exclude_zeros, null=null,
mixture_coefs=mixture_coefs, verbose=verbose)
mixture_coefs=mixture_coefs, verbose=verbose, bias=bias)
if not infer_var_lowres['converged']:
return struct_draft_fullres, alpha_, beta_, hsc_r, False
hsc_r = distance_between_homologs(
Expand Down Expand Up @@ -205,7 +206,8 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
struct_draft_fullres=None, draft=False, simple_diploid=False,
callback_freq=None, callback_function=None, reorienter=None,
alpha_true=None, struct_true=None, input_weight=None,
exclude_zeros=False, null=False, mixture_coefs=None, verbose=True):
exclude_zeros=False, null=False, mixture_coefs=None, verbose=True,
bias=None):
"""Infer 3D structures with PASTIS via Poisson model.

Optimize 3D structure from Hi-C contact counts data for diploid
Expand Down Expand Up @@ -298,6 +300,8 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
For diploid organisms: whether this optimization is inferring a "simple
diploid" structure in which homologs are assumed to be identical and
completely overlapping with one another.
bias: str, optional
The path to the bias vector to normalize the counts data with.

Returns
-------
Expand Down Expand Up @@ -364,7 +368,7 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
reorienter=reorienter, alpha_true=alpha_true,
struct_true=struct_true, input_weight=input_weight,
exclude_zeros=exclude_zeros, null=null, mixture_coefs=mixture_coefs,
verbose=verbose)
verbose=verbose, bias=bias)
if not draft_converged:
return None, {'alpha': alpha_, 'beta': beta_, 'seed': seed,
'converged': draft_converged}
Expand Down Expand Up @@ -408,7 +412,7 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
filter_threshold=filter_threshold, multiscale_factor=multiscale_factor,
exclude_zeros=exclude_zeros, beta=beta_, input_weight=input_weight,
verbose=verbose, fullres_torm=fullres_torm,
excluded_counts=excluded_counts, mixture_coefs=mixture_coefs)
excluded_counts=excluded_counts, mixture_coefs=mixture_coefs, bias=bias)
if verbose:
print('BETA: %s' % ', '.join(
['%s=%.3g' % (c.ambiguity, c.beta) for c in counts if c.sum() != 0]),
Expand Down Expand Up @@ -578,7 +582,7 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0,
callback_freq=callback_freq, reorienter=reorienter,
alpha_true=alpha_true, struct_true=struct_true,
input_weight=input_weight, exclude_zeros=exclude_zeros,
null=null, mixture_coefs=mixture_coefs, verbose=verbose)
null=null, mixture_coefs=mixture_coefs, verbose=verbose, bias=bias)
if not infer_var['converged']:
return struct_, infer_var
if reorienter is not None and reorienter.reorient:
Expand All @@ -605,7 +609,7 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
piecewise_step1_accuracy=1, alpha_true=None,
struct_true=None, init='mds', input_weight=None,
exclude_zeros=False, null=False, mixture_coefs=None,
verbose=True):
verbose=True, bias=None):
"""Infer 3D structures with PASTIS via Poisson model.

Infer 3D structure from Hi-C contact counts data for haploid or diploid
Expand Down Expand Up @@ -681,6 +685,8 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
homolog-separating constraint specificying the expected mean inter-
homolog count for each chromosome, scaled by beta and biases. If
not supplied, `mhs_k` will be estimated from the counts data.
bias : str, optional
The path to the bias vector to normalize the counts with

Returns
-------
Expand All @@ -706,10 +712,11 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
callback_freq = {'print': print_freq, 'history': history_freq,
'save': save_freq}

counts, lengths_subset, chrom_subset, lengths_full, chrom_full, struct_true = load_data(
counts, lengths_subset, chrom_subset, lengths_full, chrom_full, struct_true, bias = load_data(
counts=counts, lengths_full=lengths_full, ploidy=ploidy,
chrom_full=chrom_full, chrom_subset=chrom_subset,
exclude_zeros=exclude_zeros, struct_true=struct_true)
exclude_zeros=exclude_zeros, struct_true=struct_true,
bias=bias)

outdir = _output_subdir(
outdir=outdir, chrom_full=chrom_full, chrom_subset=chrom_subset,
Expand All @@ -731,7 +738,7 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
callback_function=callback_function, callback_freq=callback_freq,
alpha_true=alpha_true, struct_true=struct_true,
input_weight=input_weight, exclude_zeros=exclude_zeros,
null=null, mixture_coefs=mixture_coefs, verbose=verbose)
null=null, mixture_coefs=mixture_coefs, verbose=verbose, bias=bias)
else:
from .piecewise_whole_genome import infer_piecewise

Expand All @@ -757,7 +764,7 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None,
piecewise_step1_accuracy=piecewise_step1_accuracy,
alpha_true=alpha_true, struct_true=struct_true, init=init,
input_weight=input_weight, exclude_zeros=exclude_zeros, null=null,
mixture_coefs=mixture_coefs, verbose=verbose)
mixture_coefs=mixture_coefs, verbose=verbose, bias=bias)

if verbose:
if infer_var['converged']:
Expand Down
Loading