Analysis of protein sequences. More...

#include "seq_analysis.h"
#include "seq_util.h"
#include "ps_sequence.h"
#include "moving_average.h"
#include "linked_list.h"
#include "utilities.h"

Functions
double	log_2 (double a)

long	seq_limit (Bmolgroup *molgroup, Bstring &refseq)
	Limits the selection to the reference sequence in an aligned set. More...

Matrix	seq_aligned_identity (Bmolgroup *molgroup)
	Calculates the pairwise identities between aligned sequences. More...

Matrix	seq_aligned_similarity (Bmolgroup *molgroup, double threshold, Bresidue_matrix *simat)
	Calculates the pairwise similarities between aligned sequences. More...

long	seq_select (Bmolgroup *molgroup, long minlen, long maxlen)
	Selects sequences within a range of lengths. More...

long	seq_select (Bmolgroup *molgroup, Matrix mat, long ref, double cutoff)
	Selects sequences based on a comparison matrix of aligned sequences. More...

long	seq_delete (Bmolgroup *molgroup, Matrix mat)
	Deletes non-selected sequences and corresponding elements of a comparison matrix. More...

string	seq_aligned_profile (Bmolgroup *molgroup)
	Generates a PROSITE format profile from an aligned set of sequences. More...

int	seq_aligned_information (Bmolgroup *molgroup, int window, Bstring &psfile)
	Calculates the sequence logo representation for an alignment. More...

int	seq_aligned_hydrophobicity (Bmolgroup *molgroup, int window, double threshold, Bstring &hphobfile, Bstring &psfile)
	Calculates the average hydrophobicity at every position in an alignment. More...

vector< Complex< float > >	seq_frequency_analysis (long win, long start, long end, vector< double > &data)
	Fourier transforms a vector for frequency analysis. More...

vector< double >	seq_aligned_weight (Bmolgroup *molgroup)

Matrix	seq_correlated_mutation (Bmolgroup *molgroup, Bstring &refseqid, double cutoff, Bstring &simfile)
	Correlated mutation analysis of an alignment. More...

Variables
int	verbose

Detailed Description

Analysis of protein sequences.

Author: Bernard Heymann

Date: Created: 19990123; Modified: 20210426

Function Documentation

◆ log_2()

double log_2 ( double a )

◆ seq_aligned_hydrophobicity()

int seq_aligned_hydrophobicity	(	Bmolgroup *	molgroup,
		int	window,
		double	threshold,
		Bstring &	hphobfile,
		Bstring &	psfile
	)

Calculates the average hydrophobicity at every position in an alignment.

Parameters

*molgroup	the set of sequences.
window	moving average window.
threshold	fraction of sequences with a residue in a position.
&hphobfile	parameter file.
&psfile	postscript output file.

Returns: int 0.

The default hydrophobicity scale is the GES scale.

◆ seq_aligned_identity()

Matrix seq_aligned_identity ( Bmolgroup * molgroup )

Calculates the pairwise identities between aligned sequences.

Parameters

*molgroup the set of sequences.

Returns: Matrix the matrix of identities.

The identity between two sequences is defined as:

$$\text{identity} = \frac{\text{number of identical residues}}{\text{overlap}}$$

where the overlap is the number of positions with residues in both sequences.

◆ seq_aligned_information()

int seq_aligned_information	(	Bmolgroup *	molgroup,
		int	window,
		Bstring &	psfile
	)

Calculates the sequence logo representation for an alignment.

Parameters

*molgroup	the set of sequences.
window	window for calculating the moving average.
&psfile	the postscript file name.

Returns: int 0.

The information content of each position in an alignment is calculated as:

$$\text{information} = \log_2(n) - \sum_i p_i \log_2(p_i)$$

$$p_i = \frac{f_i}{\sum_i f_i}$$

where $f_i$ is the frequency of residue type $i$ at this position, and $n = \sum_i f_i$ if $\sum_i f_i < 20$, otherwise $n = 20$.

A moving average of the information is calculated over a given window to smooth the resultant data. The sequence logo representation for the occurrence of every residue type at every position is generated and written into a postscript file.

◆ seq_aligned_profile()

string seq_aligned_profile ( Bmolgroup * molgroup )

Generates a PROSITE format profile from an aligned set of sequences.

Parameters

*molgroup the set of sequences.

Returns: string profile in PROSITE format.

At each position in the alignment, the number of distinct residue types is counted. If there are more than 3 residue types represented at a position, or there is a gap, it is designated as variable by an "x". The profile finally contains 1-3 residue type possibilities for highly conserved positions interspersed by variable length gaps.

◆ seq_aligned_similarity()

Matrix seq_aligned_similarity	(	Bmolgroup *	molgroup,
		double	threshold,
		Bresidue_matrix *	simat
	)

Calculates the pairwise similarities between aligned sequences.

Parameters

*molgroup	the set of sequences.
threshold	threshold to accept residues as similar.
*simat	residue similarity matrix.

Returns: Matrix the matrix of similarities.

The similarity between two sequences is defined as:

$$\text{similarity} = \frac{\sum(\text{residue similarity})}{\text{overlap}}$$

$$\text{fraction similarity} = \frac{\text{number of residues with similarity} > \text{threshold}}{\text{overlap}}$$

where the overlap is the number of positions with residues in both sequences. The residue similarity is taken from a residue substitution matrix. The default substitution matrix is BLOSUM62.

◆ seq_aligned_weight()

vector< double > seq_aligned_weight ( Bmolgroup * molgroup )

◆ seq_correlated_mutation()

Matrix seq_correlated_mutation	(	Bmolgroup *	molgroup,
		Bstring &	refseqid,
		double	cutoff,
		Bstring &	simfile
	)

Correlated mutation analysis of an alignment.

Parameters

*molgroup	the set of aligned sequences.
&refseqid	reference sequence to report on.
cutoff	cutoff for reporting correlated mutations.
&simfile	similarity matrix file.

Returns: Matrix the analysis result matrix.

Reference: Gobel, Sander & Schneider (1994) Proteins 18, 309-317.

Mutation (residue variation) correlation is defined as:

$$r(i,j) = \frac{1}{m^2\, o(i)\, o(j)} \sum_{k,l} w(k,l)\,\bigl(s(i,k,l) - \langle s(i)\rangle\bigr)\,\bigl(s(j,k,l) - \langle s(j)\rangle\bigr)$$

where:
- $m$: number of sequences
- $o(i)$: standard deviation of similarities at alignment position $i$
- $w(k,l)$: weight for sequences $k$ and $l$ (1 - fractional identity: see function seq_aligned_identity)
- $s(i,k,l)$: similarity for alignment position $i$ between sequences $k$ and $l$
- $\langle s(i)\rangle$: average similarity at alignment position $i$

Individual high-scoring correlations (using the given cutoff value) are reported as follows:

    Res1 Num1 Res2 Num2 Total Corr
    T    9    I    17   210   0.631
    TAIIIVVVIVVVIVIIIIIII
    IILLLLLLLLLLLLLLLLLLL

The first 4 values give the type and alignment position of the correlating residues. The total is the number of comparisons made: maximally m*(m-1)/2. The last number is the correlation coefficient. The following two lines give the corresponding residues at the two alignment positions for all the sequences, allowing the user to see on what basis this is a high correlation.

◆ seq_delete()

long seq_delete	(	Bmolgroup *	molgroup,
		Matrix	mat
	)

Deletes non-selected sequences and corresponding elements of a comparison matrix.

Parameters

*molgroup	the set of sequences.
mat	comparison matrix.

Returns: long number of sequences retained.

◆ seq_frequency_analysis()

vector< Complex< float > > seq_frequency_analysis	(	long	win,
		long	start,
		long	end,
		vector< double > &	data
	)

Fourier transforms a vector for frequency analysis.

Parameters

win	window size.
start	start within window.
end	end within window.
&data	sequence.

Returns: vector< Complex< float > > the Fourier transform of the data.

A brute force Fourier transform is done.

◆ seq_limit()

long seq_limit	(	Bmolgroup *	molgroup,
		Bstring &	refseq
	)

Limits the selection to the reference sequence in an aligned set.

Parameters

*molgroup	the set of sequences.
&refseq	reference sequence identifier.

Returns: long number of selected residues.

◆ seq_select() [1/2]

long seq_select	(	Bmolgroup *	molgroup,
		long	minlen,
		long	maxlen
	)

Selects sequences within a range of lengths.

Parameters

*molgroup	the set of sequences.
minlen	minimum length.
maxlen	maximum length.

Returns: long number of sequences retained.

◆ seq_select() [2/2]

long seq_select	(	Bmolgroup *	molgroup,
		Matrix	mat,
		long	ref,
		double	cutoff
	)

Selects sequences based on a comparison matrix of aligned sequences.

Parameters

*molgroup	the set of sequences.
mat	comparison matrix.
ref	reference sequence number (starting at 1).
cutoff	threshold for selecting sequences.

Returns: long number of sequences retained.

Variable Documentation

◆ verbose

int verbose

extern

Functions

Variables

Detailed Description

Function Documentation

◆ log_2()

◆ seq_aligned_hydrophobicity()

◆ seq_aligned_identity()

◆ seq_aligned_information()

◆ seq_aligned_profile()

◆ seq_aligned_similarity()

◆ seq_aligned_weight()

◆ seq_correlated_mutation()

◆ seq_delete()

◆ seq_frequency_analysis()

◆ seq_limit()

◆ seq_select() [1/2]

◆ seq_select() [2/2]

Variable Documentation

◆ verbose