/* GNU polyxmass - the massist's program.
   -------------------------------------- 
   Copyright (C) 2000,2001,2002,2003,2004 Filippo Rusconi

   http://www.polyxmass.org

   This file is part of the "GNU polyxmass" project.
   
   The "GNU polyxmass" project is an official GNU project package (see
   www.gnu.org) released ---in its entirety--- under the GNU General
   Public License and was started at the Centre National de la
   Recherche Scientifique (FRANCE), that granted me the formal
   authorization to publish it under this Free Software License.

   This software is free software; you can redistribute it and/or
   modify it under the terms of the GNU  General Public
   License as published by the Free Software Foundation; either
   version 2 of the License, or (at your option) any later version.
   
   This software is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.
   
   You should have received a copy of the GNU  General Public
   License along with this software; if not, write to the
   Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.
*/
/*  file pxmchem-cleavespec.h
 * which contains the definitions of the cleavespec, cleaverule and
 * cleavemotif structures and the declarations of the functions
 * related to them.
 */

#ifndef PXMCHEM_CLEAVESPEC_H
#define PXMCHEM_CLEAVESPEC_H

#include "libpolyxmass-globals.h"
#include "pxmchem-polchemdef.h"



#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */


/* typedef's
 *
 * Short type names for the three structures defined below. They are
 * declared ahead of the structure definitions so that the rest of
 * this header can refer to PxmCleaveSpec, PxmCleaveRule and
 * PxmCleavemotif without the 'struct' keyword.
 */
typedef struct _PxmCleaveSpec PxmCleaveSpec;
typedef struct _PxmCleaveRule PxmCleaveRule;

typedef struct _PxmCleavemotif PxmCleavemotif;


/* A cleavage specification: a named textual pattern that tells where
 * a polymer sequence must (or must not) be cleaved, along with the
 * cleavemotif objects obtained by parsing that pattern and the
 * cleaverule objects describing how the ends of the resulting
 * oligomers are to be processed.
 */
struct _PxmCleaveSpec
{
  /* name is the string that identifies the cleavespec, like "Trypsin"
   * in the example given below for the pattern member.
   */
  gchar *name;

  /* pattern is a string of unlimited length that is dynamically
   * allocated to hold the polymer sequence pattern that will
   * guide the "cleaving object" to the right polymer sequence. For
   * example, if we still think of proteins and proteases, we might
   * define for the name "Trypsin" a pattern "Lys/;Arg/;-Lys/Pro",
   * since we want the Trypsin "cleavage object" to cut right of
   * lysine and arginine monomers, but not to cleave when the lysine
   * is immediately followed by a proline monomer.
   * As shown in this example, the syntax is simple: the '/' character
   * symbolizes the position of the chemical cleavage in the sequence.
   * There can be as many '/' characters in a pattern as needed, but
   * only one per sequence element (separation of elements is done
   * with the ';' character). The sequence element in question is
   * actually called a "motif" (see below), like in our example, "Lys"
   * or "LysPro" would be.
   */
  gchar *pattern;

  /* When a cleavespec is used, the first step is to parse its
   * pattern member and extract from it all the motifs that are in it.
   * For example "Lys/;Arg/;-Lys/Pro" is a pattern that should be
   * parsed and decomposed into a number of 'site's: "Lys/" and "Arg/"
   * and "-Lys/Pro". These textual 'site's should later be
   * deconstructed into structured data: each textual 'site' is
   * deconstructed into a cleavemotif object which has two
   * representations of a given 'site'. A first representation is
   * merely a string: the cleavemotif's 'motif' member, like "LysPro",
   * for example, if a given 'site' were "Lys/Pro". Another member of
   * the cleavemotif instance is the array of monomer codes that
   * represent the motif. In our case the cleavemotif would have an
   * array of which the first element would be an allocated string
   * "Lys" and the second another allocated string "Pro". In other
   * words, the array is a vertical tabular representation of the
   * horizontal representation "LysPro" of the same motif. The
   * cleavemotif's offset member would tell where the cleavage (if
   * any) should occur. For our example "Lys/Pro", the offset would be
   * 1, but for a site "/D", it would be 0. And finally, cleave is a
   * boolean that says if the cleavage should occur or not. For
   * example, parsing the 'site' "-Lys/Pro" would generate a FALSE
   * value for variable cleave, since the '-' indicates that cleavage
   * should not occur.
   *
   * clmGPA is the array that holds these cleavemotif objects.
   */
  GPtrArray *clmGPA;
  
  /* clrGPA is an array of PxmCleaveRule instances to hold
   * specifications about the way the left/right caps of the resulting
   * oligomer have to be processed. Indeed, when hydrolyzing a
   * polymer, the oligomers thus generated are capped with "H" at one
   * end ("cap") and with "OH" at the other cap.  This is why the
   * cutting operation is called "hydrolysis".  But sometimes, the
   * caps of the oligomers are not "H" and "OH", in particular when
   * the "lyzing factor" is not water. The PxmCleaveRule struct is
   * there to allow taking into account just this. You can apply a
   * chemical rule to the oligomers generated upon chemical cleavage
   * of the polymer.
   */
  GPtrArray *clrGPA;
};


/* A cleavage rule: four strings, two for the left side and two for
 * the right side. Instances are stored in the clrGPA array of a
 * PxmCleaveSpec, which describes them as specifications of the way
 * the left/right caps of the oligomers resulting from a cleavage have
 * to be processed.
 *
 * NOTE(review): the exact meaning of each member is not stated in
 * this header. Presumably the '*_code' members are monomer codes
 * found at the left/right end of an oligomer and the '*_actform'
 * members are the chemical action formulas to apply in that case --
 * confirm against the implementation file.
 */
struct _PxmCleaveRule
{
  /* Left-side code (presumably a monomer code -- TODO confirm). */
  gchar *left_code;

  /* Left-side "actform" (presumably an action formula -- TODO confirm). */
  gchar *left_actform;	

  /* Right-side code (presumably a monomer code -- TODO confirm). */
  gchar *right_code;

  /* Right-side "actform" (presumably an action formula -- TODO confirm). */
  gchar *right_actform;	
};



/* A cleavemotif is the structured form of one 'site' of a cleavespec
 * pattern: the motif as a string, the same motif as an array of
 * monomer codes, the position of the cleavage in the site and a flag
 * telling if cleavage must occur or not.
 */
struct _PxmCleavemotif
{
  /* 'motif' is a string to hold the precise description of the motif
   * at which one cleavage must occur. For example, if a cleavespec
   * pattern was "K/;R/;-K/P", this pattern would be deconstructed
   * into three different cleavemotif objects. Each cleavemotif object
   * would have a different 'motif' member: "K" or "R" or "KP" (the
   * '/' and '-' characters are not part of the motif, see below).
   * This horizontal representation needs to be tabulated vertically
   * because some polymer definitions allow monomer codes to be more
   * than one letter-long. This is done by putting each monomer code
   * forming a given motif into an array of strings: motifGPA. We call
   * this array motifGPA because it represents motif strings. Indeed,
   * we make a big difference between:
   *   a pattern: "Lys/;Arg/;-Lys/Pro"
   *   a site:    "-Lys/Pro"
   *   a motif:   "LysPro".
   *
   * What we see is that a motif is less informative than a site,
   * since it has lost the cleavage position indicator '/' and the
   * cleave/no-cleave indicator '-'. But these two bits of information
   * get transformed into two members of the cleavemotif: the offset
   * indicates where the cleavage occurs in the site ("Lys/Pro" would
   * yield a 1 offset, "/D" would yield a 0 offset, for example) and
   * the cleave member indicates if the cleavage must be performed or
   * not (a '-'-prefixed site means that no cleavage must be performed
   * and cleave = FALSE).
   */
  gchar *motif;

  /* We mentioned above that "K" or "R" or "KP" are three motif
   * strings that would need to be represented vertically in an
   * array. The last one, "KP", for example, would be deconstructed
   * further into two allocated strings in motifGPA: motifGPA[0]="K"
   * and motifGPA[1]="P". When the monomer codes are 1 letter-long,
   * this does not seem very useful, but what about using the standard
   * 3-letter code for this same example: motifGPA[0]="Lys" and
   * motifGPA[1]="Pro". All this starts to make sense?
   */
  GPtrArray *motifGPA;
  
  /* offset represents the coordinate at which the cleavage should
   * happen in a site. For example, if cleavage should occur right of
   * a K, the site would be "K/" and offset would be 1 (the slash is
   * right of the 'K'). If the site were "/D", the offset would be
   * 0. Finally, if we had a site "GRKK/RKGLI", the offset would be 4.
   */
  gint offset;
  
  /* Sometimes it may be necessary to flag a motif in such a way that
   * the program knows that the cleavage should not occur at this precise
   * location. This is typically the case when a site reads "-K/P", which
   * translates to "do not cleave right of K if K is immediately
   * followed by a P". The '-' sign tells that cleavage should not occur,
   * which translates into value FALSE being set to the variable 'cleave'.
   */
  gboolean cleave;
};


/* NEW'ING FUNCTIONS, DUPLICATING FUNCTIONS, INITING FUNCTIONS ...
 */
/* Takes no argument and returns a pointer to a PxmCleaveSpec.
 *
 * NOTE(review): presumably allocates a new instance that the caller
 * releases with pxmchem_cleavespec_free (); the initial state of the
 * members is set in the implementation file -- confirm there.
 */
PxmCleaveSpec *
pxmchem_cleavespec_new (void);


/* Takes no argument and returns a pointer to a PxmCleaveRule.
 *
 * NOTE(review): presumably allocates a new instance that the caller
 * releases with pxmchem_cleaverule_free () -- confirm in the
 * implementation file.
 */
PxmCleaveRule * 
pxmchem_cleaverule_new (void);


/* Takes no argument and returns a pointer to a PxmCleavemotif.
 *
 * NOTE(review): presumably allocates a new instance that the caller
 * releases with pxmchem_cleavemotif_free () -- confirm in the
 * implementation file.
 */
PxmCleavemotif *
pxmchem_cleavemotif_new (void);


/* Takes a name string and a GPtrArray and returns a pointer to a
 * PxmCleaveSpec.
 *
 * NOTE(review): being listed with the new'ing functions, this
 * presumably searches GPA for a cleavespec having that name and
 * returns a newly allocated copy of it (NULL if none is found).
 * Ownership of the result and the not-found value are not visible
 * here -- confirm in the implementation file.
 */
PxmCleaveSpec *
pxmchem_cleavespec_new_by_name (gchar *name, GPtrArray *GPA);


/* Takes a pattern string and a GPtrArray and returns a pointer to a
 * PxmCleaveSpec.
 *
 * NOTE(review): being listed with the new'ing functions, this
 * presumably searches GPA for a cleavespec having that pattern and
 * returns a newly allocated copy of it (NULL if none is found).
 * Ownership of the result and the not-found value are not visible
 * here -- confirm in the implementation file.
 */
PxmCleaveSpec *
pxmchem_cleavespec_new_by_pattern (gchar *pattern, GPtrArray *GPA);


/* Duplicating function: takes a const PxmCleaveSpec (which it thus
 * cannot modify through 'cls') and returns a pointer to a
 * PxmCleaveSpec.
 *
 * NOTE(review): whether the copy is deep (strings and both arrays
 * duplicated) is not visible in this header -- confirm in the
 * implementation file.
 */
PxmCleaveSpec *
pxmchem_cleavespec_dup (const PxmCleaveSpec *cls);


/* Duplicating function: takes a const PxmCleaveRule (which it thus
 * cannot modify through 'clr') and returns a pointer to a
 * PxmCleaveRule.
 *
 * NOTE(review): presumably the four strings are copied into a newly
 * allocated instance -- confirm in the implementation file.
 */
PxmCleaveRule * 
pxmchem_cleaverule_dup (const PxmCleaveRule *clr);


/* Setter for the left_code member of the cleaverule 'clr'. Returns a
 * gboolean.
 *
 * NOTE(review): presumably TRUE means success; whether 'left_code'
 * is copied or the pointer is stored as is cannot be told from this
 * header -- confirm in the implementation file.
 */
gboolean
pxmchem_cleaverule_set_left_code (PxmCleaveRule *clr,
				     gchar *left_code);
  

/* Setter for the right_code member of the cleaverule 'clr'. Returns
 * a gboolean.
 *
 * NOTE(review): presumably TRUE means success; whether 'right_code'
 * is copied or the pointer is stored as is cannot be told from this
 * header -- confirm in the implementation file.
 */
gboolean
pxmchem_cleaverule_set_right_code (PxmCleaveRule *clr,
				      gchar *right_code);
  

/* Setter for the left_actform member of the cleaverule 'clr'.
 * Returns a gboolean.
 *
 * NOTE(review): presumably TRUE means success; whether
 * 'left_actform' is copied or the pointer is stored as is cannot be
 * told from this header -- confirm in the implementation file.
 */
gboolean
pxmchem_cleaverule_set_left_actform (PxmCleaveRule *clr,
					gchar *left_actform);
  

/* Setter for the right_actform member of the cleaverule 'clr'.
 * Returns a gboolean.
 *
 * NOTE(review): presumably TRUE means success; whether
 * 'right_actform' is copied or the pointer is stored as is cannot be
 * told from this header -- confirm in the implementation file.
 */
gboolean
pxmchem_cleaverule_set_right_actform (PxmCleaveRule *clr,
					 gchar *right_actform);
  
/* Duplicating function: takes a const PxmCleavemotif (which it thus
 * cannot modify through 'clm') and returns a pointer to a
 * PxmCleavemotif.
 *
 * NOTE(review): whether the motifGPA array and its strings are
 * duplicated as well is not visible in this header -- confirm in the
 * implementation file.
 */
PxmCleavemotif *
pxmchem_cleavemotif_dup (const PxmCleavemotif *clm);


/* Setter for the name member of the cleavespec 'cls'. Returns a
 * gboolean.
 *
 * NOTE(review): presumably TRUE means success; whether 'name' is
 * copied or the pointer is stored as is cannot be told from this
 * header -- confirm in the implementation file.
 */
gboolean
pxmchem_cleavespec_set_name (PxmCleaveSpec *cls,
			     gchar *name);
  
/* Setter for the pattern member of the cleavespec 'cls'. Returns a
 * gboolean.
 *
 * NOTE(review): presumably TRUE means success. It is not visible
 * here whether setting the pattern also re-parses it into the clmGPA
 * array of cleavemotif objects -- confirm in the implementation
 * file.
 */
gboolean
pxmchem_cleavespec_set_pattern (PxmCleaveSpec *cls,
				gchar *pattern);

/* Setter for the motif member of the cleavemotif 'clm'. Returns a
 * gboolean.
 *
 * NOTE(review): presumably TRUE means success; whether 'motif' is
 * copied, and whether motifGPA is updated accordingly, cannot be
 * told from this header -- confirm in the implementation file.
 */
gboolean
pxmchem_cleavemotif_set_motif (PxmCleavemotif *clm,
			      gchar *motif);
  
/* Setter for the offset member of the cleavemotif 'clm', that is the
 * coordinate at which the cleavage should happen in the site.
 * Returns a gboolean.
 *
 * NOTE(review): presumably TRUE means success; any range check on
 * 'offset' is done in the implementation file -- confirm there.
 */
gboolean
pxmchem_cleavemotif_set_offset (PxmCleavemotif *clm,
				gint offset);
  
/* Setter for the cleave member of the cleavemotif 'clm', the boolean
 * telling if cleavage should occur (TRUE) or not (FALSE) at the
 * motif. Returns a gboolean.
 *
 * NOTE(review): presumably a TRUE return value means success --
 * confirm in the implementation file.
 */
gboolean
pxmchem_cleavemotif_set_cleave (PxmCleavemotif *clm,
				gboolean cleave);

  

/* INTEGRITY CHECKING FUNCTIONS
 */
/* Integrity check of a cleavespec. Takes the cleavespec, a string of
 * delimited codes, a code length, an array of reference atoms and
 * the address of a string pointer; returns a gboolean.
 *
 * NOTE(review): presumably returns TRUE when the cleavespec is valid
 * and, on failure, reports a textual explanation through '*valid'
 * (in which case ownership of that string must be checked).
 * 'delim_codes' presumably lists the monomer codes known to the
 * polymer definition and 'codelen' the maximum length of a code --
 * confirm all this in the implementation file.
 */
gboolean
pxmchem_cleavespec_validate (PxmCleaveSpec *cleavespec, gchar *delim_codes,
			     gint codelen, GPtrArray *atom_refGPA, 
			     gchar **valid);

/* Integrity check taking a cleavespec and a GPtrArray; returns a
 * gboolean.
 *
 * NOTE(review): presumably returns TRUE when the name of
 * 'cleavespec' occurs only once amongst the cleavespec objects
 * stored in GPA -- confirm in the implementation file.
 */
gboolean
pxmchem_cleavespec_unique_by_name (PxmCleaveSpec *cleavespec, 
				   GPtrArray *GPA);

/* Integrity check taking a cleavespec, a string of delimited codes
 * and a code length; returns a gboolean.
 *
 * NOTE(review): presumably checks that the pattern member obeys the
 * "site;site;..." syntax ('/' cleavage position, '-' no-cleave
 * prefix, ';' separator) and that each code is known, returning TRUE
 * when it does -- confirm in the implementation file.
 */
gboolean
pxmchem_cleavespec_check_syntax (PxmCleaveSpec *cleavespec, 
				 gchar *delim_codes,
				 gint codelen);
  
/* Integrity check of a cleaverule. Takes the cleaverule, a string of
 * delimited codes, an array of reference atoms and the address of a
 * string pointer; returns a gboolean.
 *
 * NOTE(review): presumably returns TRUE when the codes and actforms
 * of 'clr' are valid and, on failure, reports a textual explanation
 * through '*valid' -- confirm in the implementation file.
 */
gboolean
pxmchem_cleaverule_validate (PxmCleaveRule *clr, gchar *delim_codes, 
				GPtrArray *atom_refGPA, gchar **valid);




/*  LOCATING FUNCTIONS
 */
/* Locating function taking a name string and a GPtrArray; returns a
 * gint.
 *
 * NOTE(review): presumably the index in GPA of the first cleavespec
 * named 'name', and a negative value (-1?) when none is found --
 * confirm in the implementation file.
 */
gint
pxmchem_cleavespec_get_index_by_name (gchar *name, GPtrArray *GPA);


/* Locating function taking a name string and a GPtrArray; returns a
 * gint.
 *
 * NOTE(review): presumably same as
 * pxmchem_cleavespec_get_index_by_name () with the search explicitly
 * starting from the top (first item) of GPA -- confirm in the
 * implementation file, along with the not-found return value.
 */
gint
pxmchem_cleavespec_get_index_top_by_name (gchar *name, GPtrArray *GPA);


/* Locating function taking a name string and a GPtrArray; returns a
 * gint.
 *
 * NOTE(review): presumably the search starts from the bottom (last
 * item) of GPA, thus yielding the last cleavespec named 'name' --
 * confirm in the implementation file, along with the not-found
 * return value.
 */
gint
pxmchem_cleavespec_get_index_bottom_by_name (gchar *name, GPtrArray *GPA);


/* Locating function taking a pattern string and a GPtrArray; returns
 * a gint.
 *
 * NOTE(review): presumably the index in GPA of the first cleavespec
 * whose pattern equals 'pattern', and a negative value (-1?) when
 * none is found -- confirm in the implementation file.
 */
gint
pxmchem_cleavespec_get_index_by_pattern (gchar *pattern, GPtrArray *GPA);


/* Locating function taking a GPtrArray and a cleavespec pointer;
 * returns a gint. Beware that, contrary to the other locating
 * functions of this header, the array is the FIRST parameter here.
 *
 * NOTE(review): presumably the index in GPA of the item whose
 * address is 'cls', and a negative value (-1?) when it is not in the
 * array -- confirm in the implementation file.
 */
gint
pxmchem_cleavespec_get_index_by_ptr (GPtrArray *GPA, 
				 PxmCleaveSpec *cls);


/* Locating function taking a name string and a GPtrArray; returns a
 * pointer to a PxmCleaveSpec.
 *
 * NOTE(review): presumably the returned pointer is the item stored
 * in GPA (not a copy, so it must not be freed by the caller), or
 * NULL when no cleavespec is named 'name' -- confirm in the
 * implementation file.
 */
PxmCleaveSpec *
pxmchem_cleavespec_get_ptr_by_name (gchar *name, GPtrArray *GPA);


/* Locating function taking a pattern string and a GPtrArray; returns
 * a pointer to a PxmCleaveSpec.
 *
 * NOTE(review): presumably the returned pointer is the item stored
 * in GPA (not a copy, so it must not be freed by the caller), or
 * NULL when no cleavespec has that pattern -- confirm in the
 * implementation file.
 */
PxmCleaveSpec *
pxmchem_cleavespec_get_ptr_by_pattern (gchar *pattern, GPtrArray *GPA);



/* UTILITY FUNCTIONS
 */



/* XML-format TRANSACTIONS
 */
/* XML formatting function taking a cleavespec, an indentation string
 * and an integer offset; returns a string.
 *
 * NOTE(review): presumably the returned string is a newly allocated
 * XML element describing 'cls' that the caller must free, and
 * 'offset' is the number of times 'indent' is repeated at the start
 * of the outermost element -- confirm in the implementation file.
 */
gchar *
pxmchem_cleavespec_format_xml_string_cls (PxmCleaveSpec *cls, 
				      gchar *indent, gint offset);
  

/* XML rendering function taking an xml document, one of its nodes
 * and an untyped user data pointer; returns a pointer to a
 * PxmCleaveSpec.
 *
 * NOTE(review): presumably 'xml_node' is the element describing a
 * cleavespec and the result is a newly allocated instance (NULL on
 * error) owned by the caller; the use made of 'user_data' is not
 * visible here -- confirm in the implementation file.
 */
PxmCleaveSpec *
pxmchem_cleavespec_render_xml_node_cls (xmlDocPtr xml_doc,
				    xmlNodePtr xml_node,
				    gpointer user_data);

/* XML formatting function taking a cleaverule, an indentation string
 * and an integer offset; returns a string.
 *
 * NOTE(review): presumably the returned string is a newly allocated
 * XML element describing 'clr' that the caller must free, and
 * 'offset' is the number of times 'indent' is repeated at the start
 * of the outermost element -- confirm in the implementation file.
 */
gchar *
pxmchem_cleaverule_format_xml_string_clr (PxmCleaveRule *clr, 
					 gchar *indent, gint offset);
  

/* XML rendering function taking an xml document, one of its nodes
 * and an untyped user data pointer; returns a pointer to a
 * PxmCleaveRule.
 *
 * NOTE(review): presumably 'xml_node' is the element describing a
 * cleaverule and the result is a newly allocated instance (NULL on
 * error) owned by the caller; the use made of 'user_data' is not
 * visible here -- confirm in the implementation file.
 */
PxmCleaveRule *
pxmchem_cleaverule_render_xml_node_clr (xmlDocPtr xml_doc,
				       xmlNodePtr xml_node,
				       gpointer user_data);




/* FREE'ING FUNCTIONS
 */
/* Free'ing function for a cleavespec; returns a gboolean.
 *
 * NOTE(review): presumably frees the strings, the clmGPA and clrGPA
 * arrays with their items, and the structure itself, returning TRUE
 * on success; behaviour with a NULL 'cls' is not visible here --
 * confirm in the implementation file.
 */
gboolean
pxmchem_cleavespec_free (PxmCleaveSpec *cls);

/* Free'ing function for a cleaverule; returns a gboolean.
 *
 * NOTE(review): presumably frees the four member strings and the
 * structure itself, returning TRUE on success; behaviour with a NULL
 * 'clr' is not visible here -- confirm in the implementation file.
 */
gboolean
pxmchem_cleaverule_free (PxmCleaveRule *clr);

/* Free'ing function for a cleavemotif; returns a gboolean.
 *
 * NOTE(review): presumably frees the motif string, the motifGPA
 * array with its strings, and the structure itself, returning TRUE
 * on success; behaviour with a NULL 'clm' is not visible here --
 * confirm in the implementation file.
 */
gboolean
pxmchem_cleavemotif_free (PxmCleavemotif *clm);



/* GPtrArray-RELATED FUNCTIONS
 */
/* GPtrArray-related function taking an array; returns a gint.
 *
 * NOTE(review): presumably GPA holds PxmCleaveSpec instances, all of
 * which get freed along with the array itself, the return value
 * being the number of freed items -- confirm in the implementation
 * file.
 */
gint
pxmchem_cleavespec_GPA_free (GPtrArray *GPA);

/* GPtrArray-related function taking an array; returns a gint.
 *
 * NOTE(review): presumably GPA holds PxmCleaveRule instances, all of
 * which get freed along with the array itself, the return value
 * being the number of freed items -- confirm in the implementation
 * file.
 */
gint
pxmchem_cleaverule_GPA_free (GPtrArray *GPA);

/* GPtrArray-related function taking an array; returns a gint.
 *
 * NOTE(review): presumably GPA holds PxmCleavemotif instances, all
 * of which get freed along with the array itself, the return value
 * being the number of freed items -- confirm in the implementation
 * file.
 */
gint
pxmchem_cleavemotif_GPA_free (GPtrArray *GPA);





#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif /* PXMCHEM_CLEAVESPEC_H */
