<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="en" article-type="research-article">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">genes</journal-id>
      <journal-title>Genes</journal-title>
      <abbrev-journal-title abbrev-type="publisher">Genes</abbrev-journal-title>
      <abbrev-journal-title abbrev-type="pubmed">Genes</abbrev-journal-title>
      <issn pub-type="epub">2073-4425</issn>
      <publisher>
        <publisher-name>MDPI</publisher-name>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="doi">10.3390/genes3030545</article-id>
      <article-id pub-id-type="publisher-id">genes-03-00545</article-id>
      <article-categories>
        <subj-group>
          <subject>Article</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Next Generation Sequence Analysis and Computational Genomics Using Graphical Pipeline Workflows</article-title>
      </title-group>
     
      <contrib-group>
        <contrib contrib-type="author">
          <name>
            <surname>Torri</surname>
            <given-names>Federica</given-names>
          </name>
          <xref rid="af1-genes-03-00545" ref-type="aff">1</xref>
          <xref rid="af2-genes-03-00545" ref-type="aff">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Dinov</surname>
            <given-names>Ivo D.</given-names>
          </name>
          <xref rid="af2-genes-03-00545" ref-type="aff">2</xref>
          <xref rid="af3-genes-03-00545" ref-type="aff">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Zamanyan</surname>
            <given-names>Alen</given-names>
          </name>
          <xref rid="af3-genes-03-00545" ref-type="aff">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Hobel</surname>
            <given-names>Sam</given-names>
          </name>
          <xref rid="af3-genes-03-00545" ref-type="aff">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Genco</surname>
            <given-names>Alex</given-names>
          </name>
          <xref rid="af3-genes-03-00545" ref-type="aff">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Petrosyan</surname>
            <given-names>Petros</given-names>
          </name>
          <xref rid="af3-genes-03-00545" ref-type="aff">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Clark</surname>
            <given-names>Andrew P.</given-names>
          </name>
          <xref rid="af4-genes-03-00545" ref-type="aff">4</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Liu</surname>
            <given-names>Zhizhong</given-names>
          </name>
          <xref rid="af3-genes-03-00545" ref-type="aff">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Eggert</surname>
            <given-names>Paul</given-names>
          </name>
          <xref rid="af3-genes-03-00545" ref-type="aff">3</xref>
          <xref rid="af5-genes-03-00545" ref-type="aff">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Pierce</surname>
            <given-names>Jonathan</given-names>
          </name>
          <xref rid="af3-genes-03-00545" ref-type="aff">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Knowles</surname>
            <given-names>James A.</given-names>
          </name>
          <xref rid="af4-genes-03-00545" ref-type="aff">4</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Ames</surname>
            <given-names>Joseph</given-names>
          </name>
          <xref rid="af2-genes-03-00545" ref-type="aff">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Kesselman</surname>
            <given-names>Carl</given-names>
          </name>
          <xref rid="af2-genes-03-00545" ref-type="aff">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Toga</surname>
            <given-names>Arthur W.</given-names>
          </name>
          <xref rid="af2-genes-03-00545" ref-type="aff">2</xref>
          <xref rid="af3-genes-03-00545" ref-type="aff">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Potkin</surname>
            <given-names>Steven G.</given-names>
          </name>
          <xref rid="af1-genes-03-00545" ref-type="aff">1</xref>
          <xref rid="af2-genes-03-00545" ref-type="aff">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Vawter</surname>
            <given-names>Marquis P.</given-names>
          </name>
          <xref rid="af6-genes-03-00545" ref-type="aff">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Macciardi</surname>
            <given-names>Fabio</given-names>
          </name>
          <xref rid="af1-genes-03-00545" ref-type="aff">1</xref>
          <xref rid="af2-genes-03-00545" ref-type="aff">2</xref>
          <xref rid="c1-genes-03-00545" ref-type="corresp">*</xref>
        </contrib>
      </contrib-group>
       <aff id="af1-genes-03-00545"><label>1 </label>Department of Psychiatry and Human Behavior, University of California, Irvine, CA 92617, USA; E-Mails: <email>ftorri@uci.edu</email> (F.T.); <email>sgpotkin@uci.edu</email> (S.G.P.)</aff>
      <aff id="af2-genes-03-00545"><label>2 </label>Biomedical Informatics Research Network (BIRN), Information Sciences Institute, University of Southern California, Los Angeles, CA 90292, USA; E-Mails: <email>ivo.dinov@loni.ucla.edu</email> (I.D.D.); <email>jdames@uci.edu</email> (J.A.); <email>carl@isi.edu</email> (C.K.); <email>toga@loni.ucla.edu</email> (A.W.T.)</aff>
      <aff id="af3-genes-03-00545"><label>3 </label>Laboratory of Neuro Imaging (LONI), University of California, Los Angeles, CA 90095, USA; E-Mails: <email>Alen.Zamanyan@loni.ucla.edu</email> (A.Z.); <email>shobel87@gmail.com</email> (S.H.); <email>alexgenco@gmail.com</email> (A.G.); <email>Petros.Petrosyan@loni.ucla.edu</email> (P.P.); <email>zhizhong.liu@loni.ucla.edu</email> (Z.L.); <email>eggert@cs.ucla.edu</email> (P.E.); <email>jonathan.pierce@loni.ucla.edu</email> (J.P.)</aff>
      <aff id="af4-genes-03-00545"><label>4 </label>Zilkha Neurogenetic Institute, USC Keck School of Medicine, Los Angeles, CA 90033, USA; E-Mails: <email>clarkap@usc.edu</email> (A.P.C.); <email>knowles@med.usc.edu</email> (J.A.K.)</aff>
      <aff id="af5-genes-03-00545"><label>5 </label>Department of Computer Science, University of California, Los Angeles, CA 90095, USA</aff>
      <aff id="af6-genes-03-00545"><label>6 </label>Functional Genomics Laboratory, Department of Psychiatry And Human Behavior, School of Medicine, University of California, Irvine, CA 92697, USA; E-Mail: <email>mvawter@uci.edu</email></aff>
      <author-notes>
        <corresp id="c1-genes-03-00545"><label>*</label> Author to whom correspondence should be addressed; E-Mail: <email>fmacciar@uci.edu</email>; Tel.: +1-949-824-4559; Fax: +1-949-824-2072.</corresp>
      </author-notes>
      <pub-date pub-type="epub">
        <day>30</day>
        <month>08</month>
        <year>2012</year>
      </pub-date>
      <pub-date pub-type="collection">
        <month>09</month>
        <year>2012</year>
      </pub-date>
      <volume>3</volume>
      <issue>3</issue>
      <fpage>545</fpage>
      <lpage>575</lpage>
      <history>
        <date date-type="received">
          <day>06</day>
          <month>07</month>
          <year>2012</year>
        </date>
        <date date-type="rev-recd">
          <day>15</day>
          <month>08</month>
          <year>2012</year>
        </date>
        <date date-type="accepted">
          <day>15</day>
          <month>08</month>
          <year>2012</year>
        </date>
      </history>
      <permissions>
        <copyright-statement>© 2012 by the authors; licensee MDPI, Basel, Switzerland.</copyright-statement>
        <copyright-year>2012</copyright-year>
        <license xmlns:xlink="http://www.w3.org/1999/xlink" license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/3.0/">
          <p>This article is an open access article distributed under the terms and conditions of the Creative Commons Attribution license (http://creativecommons.org/licenses/by/3.0/).</p>
        </license>
      </permissions>
      <abstract>
        <p>Whole-genome and exome sequencing have already proven to be essential and powerful methods to identify genes responsible for simple Mendelian inherited disorders. These methods can be applied to complex disorders as well, and have been adopted as one of the current mainstream approaches in population genetics. These achievements have been made possible by next generation sequencing (NGS) technologies, which require substantial bioinformatics resources to analyze the dense and complex sequence data. The huge analytical burden of data from genome sequencing might be seen as a bottleneck slowing the publication of NGS papers at this time, especially in psychiatric genetics. We review the existing methods for processing NGS data, to place into context the rationale for the design of a computational resource. We describe our method, the Graphical Pipeline for Computational Genomics (GPCG), to perform the computational steps required to analyze NGS data. The GPCG implements flexible workflows for basic sequence alignment, sequence data quality control, single nucleotide polymorphism analysis, copy number variant identification, annotation, and visualization of results. These workflows cover all the analytical steps required for NGS data, from processing the raw reads to variant calling and annotation. The current version of the pipeline is freely available at <uri>http://pipeline.loni.ucla.edu</uri>. These applications of NGS analysis may gain clinical utility in the near future (e.g., identifying miRNA signatures in diseases) when the bioinformatics approach is made feasible. Taken together, the annotation tools and strategies that have been developed to retrieve information and test hypotheses about the functional role of variants present in the human genome will help to pinpoint the genetic risk factors for psychiatric disorders.</p>
      </abstract>
      <kwd-group>
        <kwd>Next Generation Sequencing (NGS)</kwd>
        <kwd>LONI pipeline</kwd>
        <kwd>SNPs</kwd>
        <kwd>CNVs</kwd>
        <kwd>workflow</kwd>
        <kwd>bioinformatics</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec>
      <title>1. Review of the Current Methodologies and Tools for NGS DNA-Sequencing Data Analysis</title>
      <p>The power and widespread availability of next-generation sequencing (NGS) platforms have significantly broadened the scale of many DNA-sequencing (DNA-Seq) applications, from detecting single nucleotide polymorphisms (SNPs) [<xref ref-type="bibr" rid="B1-genes-03-00545">1</xref>] or copy number variations (CNVs) [<xref ref-type="bibr" rid="B2-genes-03-00545">2</xref>], to assembling (new) genomes or transcriptomes [<xref ref-type="bibr" rid="B3-genes-03-00545">3</xref>], developing quantitative RNA-sequencing (RNA-Seq) analysis [<xref ref-type="bibr" rid="B4-genes-03-00545">4</xref>], or detecting epigenetic changes [<xref ref-type="bibr" rid="B5-genes-03-00545">5</xref>]. Among the many NGS applications, we focus this review on the existing methods for processing NGS DNA-Seq data.</p>
      <p>NGS technology allows sequencing short fragments of DNA across the whole genome, producing single end (SE) or paired end (PE) reads of 50–700 base-pairs (bp). The reads might need some pre-processing conversion step (e.g., conversion between solexa and fastq formats for data produced with versions of the Illumina Pipeline earlier than 1.8). The resulting raw DNA-Seq read data must then be analyzed following two computational macro-processes: (1) mapping and assembling, quality control, quality score re-calibration, realignment in “difficult” regions of the genome; and (2) advanced steps focused on variant calling (SNPs, insertions-deletions (Indels) and CNVs) and annotation. These macro-processes are briefly reviewed to provide a background for the software algorithms embedded in NGS analysis. The main software tools involved in NGS DNA-Seq analysis are reviewed in <xref ref-type="table" rid="genes-03-00545-t001">Table 1</xref> and briefly described below.</p>
      <table-wrap id="genes-03-00545-t001" position="float">
        <object-id pub-id-type="pii">genes-03-00545-t001_Table 1</object-id>
        <label>Table 1</label>
        <caption>
          <p>Review of the most used software in next-generation sequencing (NGS) data analysis, which includes two major computational macro-processes: (1) a primary step related to mapping and assembling, with alignment quality control, quality score re-calibration, realignment in “difficult” regions of the genome; and (2) secondary, advanced steps focused on variant (single nucleotide polymorphisms (SNPs), insertions-deletions (Indels) and copy number variations (CNVs)) calling and annotation. These macro-processes are briefly reviewed to provide a background for the software algorithms embedded in DNA-Seq analysis.</p>
        </caption>
                <table rules="all" style="border: solid thin">
          <thead>
            <tr align="center" style="background: black">
              <th valign="middle" style="color: white">Process</th>
              <th valign="middle" style="color: white">Software &amp; Algorithms</th>
              <th valign="middle" style="color: white">Website</th>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td align="left" valign="middle">
                <bold>Preprocessing step</bold>
              </td>
              <td align="left" valign="middle">homemade script</td>
              <td align="left" valign="middle">(N/A)</td>
            </tr>
            <tr>
              <td rowspan="8" align="left" valign="middle">
                <bold>(1.1) Alignment</bold>
              </td>
              <td align="left" valign="middle">MAQ</td>
              <td align="left" valign="middle"><uri>http://maq.sourceforge.net</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">BWA</td>
              <td align="left" valign="middle"><uri>http://bio-bwa.sourceforge.net/bwa.shtml</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">BWA-SW (SE only)</td>
              <td align="left" valign="middle"><uri>http://bio-bwa.sourceforge.net/bwa.shtml</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">PERM</td>
              <td align="left" valign="middle"><uri>http://code.google.com/p/perm/</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">BOWTIE</td>
              <td align="left" valign="middle"><uri>http://bowtie-bio.sourceforge.net</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">SOAPv2</td>
              <td align="left" valign="middle"><uri>http://soap.genomics.org.cn</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">MOSAIK</td>
              <td align="left" valign="middle"><uri>http://bioinformatics.bc.edu/marthlab/Mosaik</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">NOVOALIGN</td>
              <td align="left" valign="middle"><uri>http://www.novocraft.com/</uri></td>
            </tr>
            <tr>
              <td rowspan="3" align="left" valign="middle">
                <bold>(1.2) <italic>De novo</italic> Assembly</bold>
              </td>
              <td align="left" valign="middle">VELVET</td>
              <td align="left" valign="middle"><uri>http://www.ebi.ac.uk/%7Ezerbino/velvet</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">SOAPdenovo</td>
              <td align="left" valign="middle"><uri>http://soap.genomics.org.cn</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">ABYSS</td>
              <td align="left" valign="middle"><uri>http://www.bcgsc.ca/platform/bioinfo/software/abyss</uri></td>
            </tr>
            <tr>
              <td rowspan="2" align="left" valign="middle">
                <bold>(1.3) Basic QC</bold>
              </td>
              <td align="left" valign="middle">SAMTOOLS</td>
              <td align="left" valign="middle"><uri>http://sourceforge.net/projects/SAMtools/files/</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">PICARD</td>
              <td align="left" valign="middle"><uri>http://picard.sourceforge.net/command-line-overview.shtml</uri></td>
            </tr>
            <tr>
              <td rowspan="4" align="left" valign="middle">
                <bold>(1.4) Advanced QC</bold>
              </td>
              <td align="left" valign="middle">GATK</td>
              <td align="left" valign="middle"><uri>http://www.broadinstitute.org/gsa/wiki/index.php/The_Genome_Analysis_Toolkit</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">PICARD</td>
              <td align="left" valign="middle"><uri>http://picard.sourceforge.net/</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">SAMTOOLS</td>
              <td align="left" valign="middle"><uri>http://sourceforge.net/projects/SAMtools/files/</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">IGVtools</td>
              <td align="left" valign="middle"><uri>http://www.broadinstitute.org/igv/igvtools</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">
                <bold>(2.1a) Variant Calling and annotation</bold>
              </td>
              <td align="left" valign="middle"> </td>
              <td align="left" valign="middle"> </td>
            </tr>
            <tr>
              <td rowspan="3" align="left" valign="middle">
                <italic>Sequence Variant Analyzer v1.0, for hg18 annotations</italic>
              </td>
              <td align="left" valign="middle">SVA</td>
              <td align="left" valign="middle"><uri>http://www.svaproject.org/</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">SAMTOOLS</td>
              <td align="left" valign="middle"><uri>http://sourceforge.net/projects/SAMtools/files/</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">ERDS</td>
              <td align="left" valign="middle"><uri>http://www.duke.edu/~mz34/erds.htm</uri></td>
            </tr>
            <tr>
              <td rowspan="2" align="left" valign="middle">
                <italic>SAMTOOLS and ANNOVAR for annotation</italic>
              </td>
              <td align="left" valign="middle">SAMTOOLS</td>
              <td align="left" valign="middle"><uri>http://sourceforge.net/projects/SAMtools/files/</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">ANNOVAR</td>
              <td align="left" valign="middle"><uri>http://www.openbioinformatics.org/annovar/</uri></td>
            </tr>
            <tr>
              <td rowspan="2" align="left" valign="middle">
                <italic>UnifiedGenotyper and ANNOVAR for annotation</italic>
              </td>
              <td align="left" valign="middle">GATK</td>
              <td align="left" valign="middle"><uri>http://www.broadinstitute.org/gsa/wiki/index.php/The_Genome_Analysis_Toolkit</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">ANNOVAR</td>
              <td align="left" valign="middle"><uri>http://www.openbioinformatics.org/annovar/</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">
                <bold>(2.1b) CNVs</bold>
              </td>
              <td align="left" valign="middle"> </td>
              <td align="left" valign="middle"> </td>
            </tr>
            <tr>
              <td rowspan="2" align="left" valign="middle">CNVseq</td>
              <td align="left" valign="middle">CNVseq</td>
              <td align="left" valign="middle"><uri>http://tiger.dbs.nus.edu.sg/cnv-seq/</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">R</td>
              <td align="left" valign="middle"><uri>http://www.r-project.org/</uri></td>
            </tr>
            <tr>
              <td rowspan="3" align="left" valign="middle">
                <italic>SAMTOOLS/ERDS/Sequence variant analyzer v1.0 ERDS</italic>
              </td>
              <td align="left" valign="middle">SVA</td>
              <td align="left" valign="middle"><uri>http://www.svaproject.org/</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">SAMTOOLS</td>
              <td align="left" valign="middle"><uri>http://sourceforge.net/projects/SAMtools/files/</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">ERDS</td>
              <td align="left" valign="middle"><uri>http://www.duke.edu/~mz34/erds.htm</uri></td>
            </tr>
            <tr>
              <td rowspan="3" align="left" valign="middle">
                <italic>CNVer</italic>
              </td>
              <td align="left" valign="middle">CNVer</td>
              <td align="left" valign="middle"><uri>http://compbio.cs.toronto.edu/CNVer/</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">BOWTIE</td>
              <td align="left" valign="middle"><uri>http://bowtie-bio.sourceforge.net</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">SAVANT</td>
              <td align="left" valign="middle"><uri>http://compbio.cs.toronto.edu/savant/</uri></td>
            </tr>
            <tr>
              <td align="left" valign="middle">
                <bold>Simulated data generation tool</bold>
              </td>
              <td align="left" valign="middle">dwgsim</td>
              <td align="left" valign="middle"><uri>http://sourceforge.net/projects/dnaa/</uri></td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <sec>
        <title>1.1. Alignment</title>
        <p>The first process in DNA-Seq data analysis involves <italic>alignment</italic> and <italic>assembly</italic>. <italic>Alignment</italic> is the process of mapping DNA-Seq reads to a reference genome. Many sequence alignment software tools that are available today use two main algorithms: the <italic>hash-based</italic> and the <italic>Burrows-Wheeler Transform</italic> methods. </p>
        <p>Some hash-based algorithms build their hash table on the set of input reads (MAQ [<xref ref-type="bibr" rid="B6-genes-03-00545">6</xref>], Illumina’s ELAND unpublished algorithm, SHRiMP [<xref ref-type="bibr" rid="B7-genes-03-00545">7</xref>], ZOOM [<xref ref-type="bibr" rid="B8-genes-03-00545">8</xref>]). Another set of tools build their hash table on the reference genome (SOAPv2 [<xref ref-type="bibr" rid="B9-genes-03-00545">9</xref>], BFAST, <uri>http://genome.ucla.edu/bfast/</uri>, MOSAIK <uri>http://bioinformatics.bc.edu/marthlab/Mosaik/</uri>, Novoalign <uri>http://www.novocraft.com/main/index.php</uri>, PERM [<xref ref-type="bibr" rid="B10-genes-03-00545">10</xref>]). After building the hash-table these methods can either use the reference genome to scan the hash table of input reads, or use the set of input reads to scan the hash table of the reference genome.</p>
        <p>Many recent algorithms rely on the theory of string matching using Burrows-Wheeler Transform (BWT). BWT algorithms (BOWTIE [<xref ref-type="bibr" rid="B11-genes-03-00545">11</xref>], BWA [<xref ref-type="bibr" rid="B12-genes-03-00545">12</xref>], SOAPv2 [<xref ref-type="bibr" rid="B9-genes-03-00545">9</xref>]) typically create a suffix array from the BWT transformed sequence, rather than from the original sequence. In the first step, the sequence order of the reference genome is modified using the BWT, a reversible process (<italic>i.e.</italic>, the original genome sequence can be reconstructed backwards) that reorders the genome grouping together in the data structure the sequences that appear multiple times. Next, the final index is created and is used for rapid read placement on the genome. The main advantage of BWT algorithms is their speed, as they are much faster than hash-based algorithms at the same sensitivity level [<xref ref-type="bibr" rid="B3-genes-03-00545">3</xref>]. </p>
      </sec>
      <sec>
        <title>1.2. Assembly</title>
        <p><italic>Assembly</italic> starts from aligned DNA-Seq reads to reconstruct the original DNA sequence computationally, which generates large, continuous regions of DNA sequence [<xref ref-type="bibr" rid="B3-genes-03-00545">3</xref>]. Many alignment software packages provide tools to perform the assembly after the read alignment (e.g., MAQ), or standalone resources can be used (SAMTOOLS [<xref ref-type="bibr" rid="B13-genes-03-00545">13</xref>], Emboss [<xref ref-type="bibr" rid="B14-genes-03-00545">14</xref>]) or commercial packages like Geneious (<uri>http://www.geneious.com</uri>) and CLC-Bio (<uri>http://www.clcbio.com</uri>). For organisms without a sequenced reference genome, it is not possible to perform any reference genome guided assembly of the reads; thus, <italic>de novo</italic> assembly is always an essential step for data analysis. The majority of <italic>de novo</italic> assemblers that have been released follow two basic approaches: overlap graphs [<xref ref-type="bibr" rid="B15-genes-03-00545">15</xref>] and de Bruijn graphs [<xref ref-type="bibr" rid="B16-genes-03-00545">16</xref>]. The overlap graph method calculates all the pair-wise overlaps between the reads and reports this information in a graph. The manipulation of the same overlap graph leads to a layout of reads and then to a consensus sequence of contigs using Celera Assembler [<xref ref-type="bibr" rid="B17-genes-03-00545">17</xref>] or Arachne [<xref ref-type="bibr" rid="B18-genes-03-00545">18</xref>] among others. This traditional approach is computationally intensive as the overlap graph is extremely large even for simple organisms. The de Bruijn graph algorithm is used by most assemblers (Velvet [<xref ref-type="bibr" rid="B19-genes-03-00545">19</xref>], SOAPdeNOVO [<xref ref-type="bibr" rid="B20-genes-03-00545">20</xref>], ABySS [<xref ref-type="bibr" rid="B21-genes-03-00545">21</xref>]) and reduces the computational load by breaking reads into smaller sub-sequences of DNA, called k-mers, where the k parameter describes the length in bases of these sequences [<xref ref-type="bibr" rid="B22-genes-03-00545">22</xref>]. The <italic>de novo</italic> assembly can also be used to resolve complex genomic regions (e.g., rapidly evolving or rich in repetitive elements) of organisms with a reference genome. In this case the contigs are aligned back to the reference genome and can undergo all the subsequent analytical steps described here.</p>
      </sec>
      <sec>
        <title>1.3. Quality Control Improvement of Reads</title>
        <p>There are many issues that must be considered when dealing with NGS data, beginning with the alignment of short reads. As an example, since each read is aligned independently, many reads spanning Indels may be misaligned. The per-base quality scores (<italic>i.e.</italic>, the probability that the called base in the read is the true base [<xref ref-type="bibr" rid="B23-genes-03-00545">23</xref>]) may also be inaccurate due to systematic errors in sequencing technology, machine cycle, and sequence context [<xref ref-type="bibr" rid="B24-genes-03-00545">24</xref>,<xref ref-type="bibr" rid="B25-genes-03-00545">25</xref>,<xref ref-type="bibr" rid="B26-genes-03-00545">26</xref>]. Thus, following the alignment and/or assembly of reads, quality control steps are implemented before continuing in the downstream analyses (in <xref ref-type="sec" rid="sec2dot1-genes-03-00545">Section 2.1</xref>).</p>
        <sec>
          <title>1.3.1. Basic Quality Control and File Formatting</title>
          <p>A first basic quality control (QC) check involves formatting the aligned reads in a conventional format (e.g., Sequence Alignment/Map (SAM) or Binary Sequence Alignment/Map (BAM)). The output of this process is a clean, sorted, indexed file in BAM format that can be subjected to more advanced QC procedures, or be used directly in the downstream analyses. </p>
        </sec>
        <sec>
          <title>1.3.2. Advanced QC</title>
          <p>Additional advanced QC steps are strongly recommended since misaligned reads and inaccurate quality scores affect the reliability of the subsequent SNP discovery and genotyping steps. Without correcting for such stochastic and systemic errors, the rate of false positive calls can be very high [<xref ref-type="bibr" rid="B27-genes-03-00545">27</xref>]. Even though we lack a gold standard for these procedures, there are computational tools to perform advanced QC on the data, in addition to some basic descriptive statistics and quality metrics visualization (SAMTOOLS [<xref ref-type="bibr" rid="B13-genes-03-00545">13</xref>], PICARD, <uri>http://picard.sourceforge.net/</uri>, Genome Analysis Toolkit (GATK) [<xref ref-type="bibr" rid="B27-genes-03-00545">27</xref>]). GATK supports locally realigning reads across regions enriched in Indels, and recalibrating base quality scores of sequencing reads to correct for variation in quality with machine cycle and sequence context. This advanced QC of NGS data is probably the most important process to guarantee an accurate variant call, which is the immediate downstream analytical step.</p>
        </sec>
      </sec>
      <sec>
        <title>1.4. Variant Calling and Annotation</title>
        <p>Once the reads have been aligned and calibrated, SNPs, Indels and CNVs can be called. This step requires sensitive and specific statistical models and tools [<xref ref-type="bibr" rid="B6-genes-03-00545">6</xref>,<xref ref-type="bibr" rid="B9-genes-03-00545">9</xref>,<xref ref-type="bibr" rid="B12-genes-03-00545">12</xref>,<xref ref-type="bibr" rid="B13-genes-03-00545">13</xref>,<xref ref-type="bibr" rid="B28-genes-03-00545">28</xref>,<xref ref-type="bibr" rid="B29-genes-03-00545">29</xref>,<xref ref-type="bibr" rid="B30-genes-03-00545">30</xref>,<xref ref-type="bibr" rid="B31-genes-03-00545">31</xref>,<xref ref-type="bibr" rid="B32-genes-03-00545">32</xref>,<xref ref-type="bibr" rid="B33-genes-03-00545">33</xref>,<xref ref-type="bibr" rid="B34-genes-03-00545">34</xref>,<xref ref-type="bibr" rid="B35-genes-03-00545">35</xref>,<xref ref-type="bibr" rid="B36-genes-03-00545">36</xref>,<xref ref-type="bibr" rid="B37-genes-03-00545">37</xref>,<xref ref-type="bibr" rid="B38-genes-03-00545">38</xref>,<xref ref-type="bibr" rid="B39-genes-03-00545">39</xref>,<xref ref-type="bibr" rid="B40-genes-03-00545">40</xref>,<xref ref-type="bibr" rid="B41-genes-03-00545">41</xref>,<xref ref-type="bibr" rid="B42-genes-03-00545">42</xref>,<xref ref-type="bibr" rid="B43-genes-03-00545">43</xref>,<xref ref-type="bibr" rid="B44-genes-03-00545">44</xref>,<xref ref-type="bibr" rid="B45-genes-03-00545">45</xref>,<xref ref-type="bibr" rid="B46-genes-03-00545">46</xref>,<xref ref-type="bibr" rid="B47-genes-03-00545">47</xref>,<xref ref-type="bibr" rid="B48-genes-03-00545">48</xref>], named in <xref ref-type="table" rid="genes-03-00545-t001">Table 1</xref>.</p>
        <sec>
          <title>1.4.1. SNPs and Indels Calling and Annotation</title>
          <p>There are many algorithms that may be used to call SNPs from NGS data (SAMTOOLS [<xref ref-type="bibr" rid="B13-genes-03-00545">13</xref>], GATK, MAQ, SOAPv2, UnifiedGenotyperV2 within the GATK suite) and some recommended analytical and statistical frameworks [<xref ref-type="bibr" rid="B49-genes-03-00545">49</xref>], even if a gold standard for variant calling is still lacking as statistical methods for analyzing the data are constantly being released [<xref ref-type="bibr" rid="B49-genes-03-00545">49</xref>]. The SNPs and Indels are exported from these tools in variant call format (VCF), with much information related to each variant (<italic>i.e.</italic>, quality score, coverage, estimated genotype). Once the variants have been called they need to be annotated, and this is a step that, until recently, only a few computational tools were able to accomplish like ANNOVAR [<xref ref-type="bibr" rid="B50-genes-03-00545">50</xref>], Sequence Variant Analyzer (SVA) [<xref ref-type="bibr" rid="B51-genes-03-00545">51</xref>], and GATK [<xref ref-type="bibr" rid="B27-genes-03-00545">27</xref>].</p>
        </sec>
        <sec>
          <title>1.4.2. CNVs Calling</title>
          <p>Furthermore, the field of computational methods for discovering structural variation on NGS data is still an open computational and bioinformatics challenge [<xref ref-type="bibr" rid="B2-genes-03-00545">2</xref>]. The CNVs discovery methods operate following a framework that allows detecting anomalous “signatures” or patterns, then calls the related variants using mainly four different approaches [<xref ref-type="bibr" rid="B2-genes-03-00545">2</xref>]: (1) read pair methods; (2) read-depth methods; (3) split read approaches; (4) <italic>de novo</italic> assembly. </p>
        </sec>
      </sec>
      <sec>
        <title>1.5. Statistical and Variant Prioritization Analysis</title>
        <p>Additional software such as PLINKseq (<uri>http://atgu.mgh.harvard.edu/plinkseq/</uri>) implements statistical models to analyze variants called from NGS experiments, testing for association with continuous or dichotomous traits and assessing an unusual distribution for rare variation across different functional categories [<xref ref-type="bibr" rid="B52-genes-03-00545">52</xref>]. Some other tools like PolyPhen2 [<xref ref-type="bibr" rid="B53-genes-03-00545">53</xref>] and VAAST [<xref ref-type="bibr" rid="B54-genes-03-00545">54</xref>] can be used afterwards for functional variant annotation and prioritization providing hints on the biology and pathophysiology of psychiatric disorders. Also, alternative annotation tools and strategies have been proposed [<xref ref-type="bibr" rid="B55-genes-03-00545">55</xref>] to retrieve information and test hypotheses about the functional role of variants present by chance in any single human genome or enriched in the genome of people affected by a psychiatric disease [<xref ref-type="bibr" rid="B56-genes-03-00545">56</xref>,<xref ref-type="bibr" rid="B57-genes-03-00545">57</xref>].</p>
      </sec>
      <sec>
        <title>1.6. Graphical Workflows</title>
        <p>The development and release of algorithms and software for analysis of NGS data has seen exponential growth in the last two years, requiring a huge investment in terms of time, expertise and computer infrastructure. </p>
        <p>From this brief review of tools (<xref ref-type="table" rid="genes-03-00545-t001">Table 1</xref>), it is evident that analyzing NGS data is a challenging and time-consuming operation for scientists. Ideally, these tools must be up to date and easy to use, and their sequential combination should optimize performance and accuracy, with each program producing output files compatible with the input requirements of the software performing the following operation. Such processes sequentially linked together build what is generally called a workflow. An increasingly large number of workflows are available today to manage high-throughput genomics sequencing data, from basic data processing to high-quality visualization of results. Examples include shell-scripts [<xref ref-type="bibr" rid="B58-genes-03-00545">58</xref>,<xref ref-type="bibr" rid="B59-genes-03-00545">59</xref>], tool-specific graphical interfaces [<xref ref-type="bibr" rid="B60-genes-03-00545">60</xref>,<xref ref-type="bibr" rid="B61-genes-03-00545">61</xref>], and graphical workflow environments [<xref ref-type="bibr" rid="B62-genes-03-00545">62</xref>,<xref ref-type="bibr" rid="B63-genes-03-00545">63</xref>,<xref ref-type="bibr" rid="B64-genes-03-00545">64</xref>,<xref ref-type="bibr" rid="B65-genes-03-00545">65</xref>]. Graphical workflow environments are emerging as useful for constructing, modifying, interconnecting and executing computational genomics protocols using data processing workflows, also described as “pipelines” once the processes have been connected (<xref ref-type="table" rid="genes-03-00545-t002">Table 2</xref>). </p>
        <table-wrap id="genes-03-00545-t002" position="float">
          <object-id pub-id-type="pii">genes-03-00545-t002_Table 2</object-id>
          <label>Table 2</label>
          <caption>
            <p>Comparison of several Graphical Workflow Environments to manage pipelines. Most workflow environments provide graphical solutions (infrastructures) for the interactive handling of data, with several advantageous features compared to the management of the same processes via command line or scripting interfaces. When adding new software tools, some of these architectures require software recompilation and some do not. Yet, there is significant variation of the status reports generated during or after workflow execution. Data storage, internal or external, operating system and local hardware dependencies and utilization of available grid managers also vary between the different workflow environments. There are many synergies between the Pipeline and various alternative environments for software tool integration and interoperability, with also some valuable differences. The Laboratory of Neuro Imaging (LONI) pipeline infrastructure provides computational workflow execution capability with or without the use of local hardware or administrative support. Adding new software tools to the pipeline library of tools is efficient, does not require recompiling the programs, and requires only a brief description of the tool invocation syntax using the client “module description” dialog. Thus, the LONI pipeline offers a flexibility and simplicity in design of novel workflow solutions that is not available in the other two most used systems for NGS data analysis, Taverna and Galaxy. Similarly, the LONI pipeline allows workflow pausing and resuming, and provides explicit controls ensuring that processes are only instantiated when the complete upstream activities have successfully completed execution. Additionally, the available Taverna and Galaxy services have restrictive upper limits on storage (100 GB) and per-process RAM (64 GB), when they are deployed on Amazon Web-Services/Cloud creating bottlenecks with data staging to/from the servers and computational runs. 
The Pipeline service provides a pair of dedicated open-access servers (<uri>http://genomics.loni.ucla.edu</uri>), each with 40 cores and 1.4 TB of shared RAM.</p>
          </caption>
          <table rules="all" style="border: solid thin">
            <thead>
              <tr style="background: black">
                <th align="center" valign="middle" style="color: white">Workflow Management System</th>
                <th align="center" valign="middle" style="color: white">Module concatenation and interoperability</th>
                <th align="center" valign="middle" style="color: white">Asynchronous Task Management</th>
                <th align="center" valign="middle" style="color: white">Requires Tool Recompiling</th>
                <th align="center" valign="middle" style="color: white">Data Storage</th>
                <th align="center" valign="middle" style="color: white">Platform Independent</th>
                <th align="center" valign="middle" style="color: white">Client-Server Model</th>
                <th align="center" valign="middle" style="color: white">Grid Enabled</th>
              </tr>
            </thead>
            <tbody>
              <tr style="border-top: solid thin">
                <td align="left" valign="middle">LONI Pipeline [<xref ref-type="bibr" rid="B57-genes-03-00545">57</xref>]</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">External</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
              </tr>
              <tr>
                <td align="left" valign="middle">
                  <bold><uri>pipeline.loni.ucla.edu</uri></bold>
                </td>
              </tr>
              <tr style="border-top: solid thin">
                <td align="left" valign="middle">Taverna [<xref ref-type="bibr" rid="B61-genes-03-00545">61</xref>]</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">Internal(MIR)</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
              </tr>
              <tr>
                <td align="left" valign="middle">
                  <bold><uri>taverna.sourceforge.net</uri></bold>
                </td>
              </tr>
              <tr style="border-top: solid thin">
                <td align="left" valign="middle">Kepler [<xref ref-type="bibr" rid="B54-genes-03-00545">54</xref>]</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">Internal(actors)</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
              </tr>
              <tr>
                <td align="left" valign="middle">
                  <bold><uri>kepler-project.org</uri> </bold>
                </td>
              </tr>
              <tr style="border-top: solid thin">
                <td align="left" valign="middle">Triana [<xref ref-type="bibr" rid="B66-genes-03-00545">66</xref>] </td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">Internal data structure</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
              </tr>
              <tr>
                <td align="left" valign="middle">
                  <bold><uri>trianacode.org</uri></bold>
                </td>
              </tr>
              <tr style="border-top: solid thin">
                <td align="left" valign="middle">Workflow Navigation System [<xref ref-type="bibr" rid="B67-genes-03-00545">67</xref>] </td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">N/A</td>
                <td rowspan="2" align="center" valign="middle">External</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
               <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">N</td>
              </tr>
              <tr>
                <td align="left" valign="middle">
                  <bold><uri>wns.nig.ac.jp</uri></bold>
                </td>
              </tr>
              <tr style="border-top: solid thin">
                <td align="left" valign="middle">Galaxy [<xref ref-type="bibr" rid="B55-genes-03-00545">55</xref>]</td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">External</td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">N</td>
              </tr>
              <tr>
                <td align="left" valign="middle">
                  <bold><uri>usegalaxy.org</uri></bold>
                </td>
              </tr>
              <tr style="border-top: solid thin">
                <td align="left" valign="middle">VisTrails [<xref ref-type="bibr" rid="B55-genes-03-00545">55</xref>]</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">Y</td>
                <td rowspan="2" align="center" valign="middle">Internal</td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">N</td>
                <td rowspan="2" align="center" valign="middle">N</td>
              </tr>
              <tr>
                <td align="left" valign="middle">
                  <bold><uri>www.vistrails.org</uri></bold>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
          <fn>
           <p><italic>Workflow Management System</italic>: list of the compared graphical workflow environments. <italic>Module concatenation and interoperability</italic>: ability to interlink the outputs of one module into a subsequent module. <italic>Asynchronous Task Management</italic>: ability to submit new workflows and report the status of executing or completed workflows asynchronously, e.g., in the presence of constant interruptions of network connectivity. <italic>Requires Tool Recompiling</italic>: requirement to recompile new computational libraries or software tools against the graphical environment libraries, and to restart the environment when provisioning these new services. <italic>Data Storage</italic>: ability of environment to store data (raw, processed and derived) internally (RAM/DB) or externally (NFS/Services). <italic>Platform Independent</italic>: dependency of the environment on the local hardware and operating system. Platform independence refers to the workflow environment itself, not the computational library of tools that are accessible via that environment. For environments with Client-Server architecture, this is irrelevant, as the platform-independent clients can always connect to, submit data, process protocols and monitor the status of executing pipeline workflows by connecting to (possibly platform dependent) back-end pipeline servers where specific operating systems (most commonly Linux) may be required by many informatics and genomics computing libraries. <italic>Client-Server Model</italic>: independent server and clients that can be broadly interconnected. <italic>Grid Enabled</italic>: use of a Grid Engine/Grid Job Manager. Legend: Y = yes, N = no. </p>
          </fn>
          </table-wrap-foot>
        </table-wrap>
       
        <p>There are several additional features in graphical workflow environments that simplify the data management. <xref ref-type="table" rid="genes-03-00545-t002">Table 2</xref> lists some of the commonly used workflow environments and compares their core features. Each of the graphical environments described in <xref ref-type="table" rid="genes-03-00545-t002">Table 2</xref> allows design and submission of new workflows. </p>
        <p>The most commonly used systems in NGS analysis are Taverna [<xref ref-type="bibr" rid="B68-genes-03-00545">68</xref>] and Galaxy [<xref ref-type="bibr" rid="B64-genes-03-00545">64</xref>]. The latest beta-version of the Galaxy (<uri>http://galaxy.psu.edu/</uri>) platform offers an NGS computational framework that embeds single processing units to be invoked as web-services, but it still lacks the functionality of interlinking the outputs of one process into a subsequent module. The available Taverna and Galaxy services have restrictive upper limits on storage and per-process RAM, when they are deployed on web-services, creating bottlenecks with data staging to/from the servers and computational runs. Also, the library of available Galaxy routines does not allow adding new tools and is limited to a few alignment software tools (e.g., BWA and Bowtie), or on the quality control side, an incomplete beta-version of the PICARD suite. </p>
        <p>The LONI (Laboratory of Neuro Imaging) pipeline displays unique features for the implementation of new interactive and robust NGS analysis workflow protocols using a graphical environment. The design of novel workflow solutions within the LONI pipeline environment is simple and flexible [<xref ref-type="bibr" rid="B69-genes-03-00545">69</xref>], and allows embedding and connecting heterogeneous software within the same computational protocol without the need of advanced bioinformatics skills (<xref ref-type="table" rid="genes-03-00545-t002">Table 2</xref>).</p>
        <p>The LONI pipeline architecture [<xref ref-type="bibr" rid="B70-genes-03-00545">70</xref>,<xref ref-type="bibr" rid="B71-genes-03-00545">71</xref>] is a distributed environment utilizing a client-server interface for design, validation, execution and dissemination of computational protocols as graphical workflows. Individual applications are represented as modules and may be linked to form complex network implementation of the desired analytical processes. Using a flexible, user friendly and customizable data processing and visualization system, the LONI pipeline environment provides access to distributed datasets, heterogeneous software tools and diverse web-services. Additional details about the LONI pipeline environment are available in [<xref ref-type="bibr" rid="B65-genes-03-00545">65</xref>].</p>
        <p>Based upon the review of the NGS software (<xref ref-type="table" rid="genes-03-00545-t001">Table 1</xref>) and the graphical workflow environments to manage workflow (<xref ref-type="table" rid="genes-03-00545-t002">Table 2</xref>), we chose to develop within the LONI environment a graphical environment for genomics, called the Graphical Pipeline for Computational Genomics (GPCG), that covers many informatics analytical steps on NGS data. This effort was a collaboration between LONI at UCLA, BIRN (Biomedical Informatics Research Network) at UCI, and the Information Sciences Institute (ISI) at USC.</p>
        <p>The GPCG is a set of workflows that may simplify and speed up approaches for sequencing projects performed on Illumina/Solexa Genome Analyzer-HiSeq platform [<xref ref-type="bibr" rid="B72-genes-03-00545">72</xref>,<xref ref-type="bibr" rid="B73-genes-03-00545">73</xref>,<xref ref-type="bibr" rid="B74-genes-03-00545">74</xref>]. We have implemented workflows in GPCG that: (1.1) align reads (both in single and paired end) and (1.2) perform <italic>de novo</italic> assembly with multiple algorithms; (1.3a) perform basic formatting and quality control, followed by a (1.3b) more advanced and complex quality control to correct for sequencing biases; (2.1a) perform SNP-Indels calling and annotation, and (2.1b) CNVs calling. The <xref ref-type="supplementary-material" rid="genes-03-00545-s001">Supplementary Materials S1</xref> provides detailed information about our experiments and results. </p>
      </sec>
    </sec>
    <sec>
      <title>2. Description of the GPCG</title>
      <p>The GPCG is a collection of “ready to use” workflows covering a broad spectrum of DNA-Seq data analysis steps. We have converted each single “command line” process into a module definition, and then connected those modules involved in the same process logically to form a workflow (<xref ref-type="fig" rid="genes-03-00545-f001">Figure 1</xref>). We have generated a comprehensive <xref ref-type="supplementary-material" rid="genes-03-00545-s001">Supplementary Materials Chapter S2</xref> that includes all details to deploy the GPCG infrastructure, reproduce the workflow designs, and validate the results reported in this paper. The full description of all the workflows is reported in <xref ref-type="supplementary-material" rid="genes-03-00545-s001">Supplementary Materials S1</xref>. Each workflow comprises a data source section, where the user can specify the location of input data; a process section, where modules can be linked to perform complex analysis steps; and an output section, where the name and location of the output files can be specified (<xref ref-type="fig" rid="genes-03-00545-f001">Figure 1</xref>).</p>
      
      <p>A single workflow (<xref ref-type="fig" rid="genes-03-00545-f001">Figure 1</xref>) can be run independently from others, or can be connected as illustrated by the analytical workflow protocols (<xref ref-type="fig" rid="genes-03-00545-f002">Figure 2</xref> and <xref ref-type="table" rid="genes-03-00545-t003">Table 3</xref>). For example, the user can reconnect workflows by dragging and dropping within the GPCG creating a new pipeline. Modularity, reuse and interconnectivity are key features within the LONI environment, which make the system flexible for the analytical needs of the researcher.</p>
     
      
      <p>The current implementation of DNA-Seq workflows is summarized in <xref ref-type="table" rid="genes-03-00545-t003">Table 3</xref>. Together with its flexible connections, the GPCG incorporates alternative algorithms for the vast majority of the processes. The choice of the algorithm is critical, as different approaches might produce different results. </p>
      <p>Users may choose in fact the most suitable analytical model for their data or develop additional module descriptions interfacing other computational tools. Additional tools, workflows and updates are available on the LONI pipeline Navigator website: <uri>http://pipeline.loni.ucla.edu/services/library-navigator</uri>. As a large number of different file formats are involved in these workflows, we provide a glossary and examples of all the formats encountered in <xref ref-type="supplementary-material" rid="genes-03-00545-s001">Supplementary Table S1</xref>.</p>
      <p>We have implemented a preprocessing module that allows extracting a subset of reads to perform a validation and initial testing of pipeline modules before running an entire dataset. Embedded within this module are routines to convert read quality scores from Solexa FASTQ files to the Sanger scale (for data produced with Illumina pipeline versions earlier than 1.8), and to binary FASTQ as required by some aligners like MAQ [<xref ref-type="bibr" rid="B40-genes-03-00545">40</xref>,<xref ref-type="bibr" rid="B75-genes-03-00545">75</xref>]. Once the read subset is ready, it is possible to proceed to the two main computational processes previously described. We also embedded another module in the pipeline, “dwgsim”, that allows the user to generate simulated read datasets according to their analytical needs (see <xref ref-type="supplementary-material" rid="genes-03-00545-s001">Supplementary Materials S1, Figure A1b, and Supplementary Materials S2</xref>).</p>
      <fig id="genes-03-00545-f001" position="float">
        <label>Figure 1</label>
        <caption>
          <p>A snapshot of the general organization of a workflow within the LONI pipeline environment. This is an example of embedded modules into an alignment workflow based on BWA software. The user can simply set up the location of the input files in the data sources, manage the programs involved in the core modules, and indicate the location of the output files in the output data sink section. Every section can be interactively edited or modified through a menu of options accessed by right-clicking the mouse on the respective portion of the workflow.</p>
        </caption>
        <graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="genes-03-00545-g001.tif"/>
      </fig>
      <fig id="genes-03-00545-f002" position="float">
        <label>Figure 2</label>
        <caption>
          <p>An example of the workflow approach to analyze DNA-Seq data in GPCG. Several alternative workflows can be run independently or connected in a logical flow. Once the reads have been pre-processed, they can be aligned (1.1), undergo (1.3) Basic and (1.4) Advanced QC, (2.1a) SNP/Indels and (2.1b) CNVs calling and annotation. The reads can also undergo (1.2) <italic>de novo</italic> assembly, and if a reference genome is available the contigs can be realigned back to the reference genome and then undergo the following computational processes. </p>
        </caption>
        <graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="genes-03-00545-g002.tif"/>
      </fig>
       <table-wrap id="genes-03-00545-t003" position="float">
        <object-id pub-id-type="pii">genes-03-00545-t003_Table 3</object-id>
        <label>Table 3</label>
        <caption>
          <p>Review of the processes and related workflows currently implemented in the NGS Pipeline. All processes and workflows have been tested and validated and are available for use by interested scientists. A single pipeline can be run independently from others, or can be connected as illustrated by the analytical workflow protocols described in this Table.</p>
        </caption>
        <table rules="all" style="border: solid thin">
          <thead>
            <tr style="background: black">
              <th align="center" valign="middle" style="color: white">Process</th>
              <th align="center" valign="middle" style="color: white">Process Description</th>
              <th align="center" valign="middle" style="color: white">Software &amp; Algorithms</th>
              <th align="center" valign="middle" style="color: white">Input *</th>
              <th align="center" valign="middle" style="color: white">Output (Files)</th>
              <th align="center" valign="middle" style="color: white">Upstream Module Dependencies</th>
              <th align="center" valign="middle" style="color: white">Downstream Module Dependencies</th>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td align="left" valign="middle">
                <bold>Preprocessing step</bold>
              </td>
              <td align="left" valign="middle">
                <bold>Test the NGS raw data and functionality</bold>
              </td>
              <td align="left" valign="middle">homemade script</td>
              <td align="left" valign="middle">reads (original solexa format)</td>
              <td align="left" valign="middle">subset of reads (fastq format)</td>
              <td align="left" valign="middle">
                <bold>none </bold>
              </td>
              <td align="left" valign="middle">
                <bold>(1.1) Alignment, (1.2) <italic>De novo</italic> assembly</bold>
              </td>
            </tr>
            <tr>
              <td rowspan="8" align="left" valign="middle">
                <bold>(1.1) Alignment</bold>
              </td>
              <td align="left" valign="middle">
                <bold>Mapping the reads to the reference genome</bold>
              </td>
              <td align="left" valign="middle">MAQ</td>
              <td align="left" valign="middle">reads (binary fastq format)</td>
              <td align="left" valign="middle">SAM</td>
              <td align="left" valign="middle">
                <bold>Preprocessing</bold>
              </td>
              <td align="left" valign="middle">
                <bold>(1.3) Basic QC</bold>
              </td>
            </tr>
            <tr>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle">BWA</td>
              <td align="left" valign="middle">reads (fastq format)</td>
              <td align="left" valign="middle">SAM</td>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle"/>
            </tr>
            <tr>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle">BWA-SW (SE only)</td>
              <td align="left" valign="middle">reads (solexa format)</td>
              <td align="left" valign="middle">SAM</td>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle"/>
            </tr>
            <tr>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle">PERM</td>
              <td align="left" valign="middle">reads (fastq format)</td>
              <td align="left" valign="middle">SAM</td>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle"/>
            </tr>
            <tr>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle">BOWTIE</td>
              <td align="left" valign="middle">reads (solexa format)</td>
              <td align="left" valign="middle">SAM</td>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle"/>
            </tr>
            <tr>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle">SOAPv2</td>
              <td align="left" valign="middle">reads (fastq format)</td>
              <td align="left" valign="middle">SAM</td>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle"/>
            </tr>
            <tr>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle">MOSAIK</td>
              <td align="left" valign="middle">reads (solexa format)</td>
              <td align="left" valign="middle">SAM</td>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle"/>
            </tr>
            <tr>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle">NOVOALIGN</td>
              <td align="left" valign="middle">reads (solexa format)</td>
              <td align="left" valign="middle">SAM</td>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle"/>
            </tr>
            <tr>
              <td rowspan="3" align="left" valign="middle">
                <bold>(1.2) <italic>De novo</italic> assembly</bold>
              </td>
              <td align="left" valign="middle">
                <bold>Build a <italic>de novo</italic> genome sequence</bold>
              </td>
              <td align="left" valign="middle">VELVET</td>
              <td align="left" valign="middle">reads (fastq format)</td>
              <td align="left" valign="middle">contigs file</td>
              <td align="left" valign="middle">
                <bold>Preprocessing</bold>
              </td>
              <td align="left" valign="middle">
                <bold>(1.1) Alignment or none <sup>$</sup></bold>
              </td>
            </tr>
            <tr>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle">SOAPdenovo</td>
              <td align="left" valign="middle">reads (fastq format)</td>
              <td align="left" valign="middle">contigs file</td>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle"/>
            </tr>
            <tr>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle">ABYSS</td>
              <td align="left" valign="middle">reads (fastq format)</td>
              <td align="left" valign="middle">contigs file</td>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle"/>
            </tr>
            <tr>
              <td align="left" valign="middle">
                <bold>(1.3) Basic QC</bold>
              </td>
              <td align="left" valign="middle">
                <bold>Basic Data formatting and quality control</bold>
              </td>
              <td align="left" valign="middle">PICARD, SAMTOOLS</td>
              <td align="left" valign="middle">SAM</td>
              <td align="left" valign="middle">BAM</td>
              <td align="left" valign="middle">
                <bold>(1.1) Alignment</bold>
              </td>
              <td align="left" valign="middle">
                <bold>(1.4) Advanced QC</bold>
              </td>
            </tr>
            <tr>
              <td align="left" valign="middle">
                <bold>(1.4) Advanced QC</bold>
              </td>
              <td align="left" valign="middle">
                <bold>QC for advanced issues</bold>
              </td>
              <td align="left" valign="middle">PICARD, SAMTOOLS, GATK</td>
              <td align="left" valign="middle">BAM</td>
              <td align="left" valign="middle">BAM clean</td>
              <td align="left" valign="middle">
                <bold>(1.3) Basic QC</bold>
              </td>
              <td align="left" valign="middle">
                <bold>(2.1a) Variant calling (2.1b) CNV analysis</bold>
              </td>
            </tr>
            <tr>
              <td rowspan="3" align="left" valign="middle">
                <bold>(2.1a) Variant calling and annotation</bold>
              </td>
              <td align="left" valign="middle">
                <bold>Identify and visualize SNPs and Indels from the whole genome</bold>
              </td>
              <td align="left" valign="middle">Sequence Variant Analyzer v1.0</td>
              <td align="left" valign="middle">BAM clean</td>
              <td align="left" valign="middle">csv files with variants and annotation </td>
              <td align="left" valign="middle">
                <bold>(1.4) Advanced QC</bold>
              </td>
              <td align="left" valign="middle">
                <bold>Statistical analysis and visualization software <sup>#</sup></bold>
              </td>
            </tr>
            <tr>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle">SAMTOOLS and ANNOVAR for annotation</td>
              <td align="left" valign="middle">BAM clean</td>
              <td align="left" valign="middle">txt files with variants</td>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle"/>
            </tr>
            <tr>
              <td align="left" valign="middle"> </td>
              <td align="left" valign="middle">Unified genotyper and ANNOVAR for annotation</td>
              <td align="left" valign="middle">BAM clean</td>
              <td align="left" valign="middle">txt files with variants</td>
              <td align="left" valign="middle"> </td>
              <td align="left" valign="middle"> </td>
            </tr>
            <tr>
              <td rowspan="3" align="left" valign="middle">
                <bold>(2.1b) CNVs calling</bold>
              </td>
              <td align="left" valign="middle">
                <bold>Analysis of CNVs (ins &amp; del &gt; 1 Kb)</bold>
              </td>
              <td align="left" valign="middle">BOWTIE CNVer SAVANT </td>
              <td align="left" valign="middle">reads (solexa format)</td>
              <td align="left" valign="middle">txt file with the CNVs calls</td>
              <td align="left" valign="middle">
                <bold>(1.4) Advanced QC</bold>
              </td>
              <td align="left" valign="middle">
                <bold>Statistical analysis and visualization software <sup>#</sup></bold>
              </td>
            </tr>
            <tr>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle">CNVseq</td>
              <td align="left" valign="middle">SAM</td>
              <td align="left" valign="middle">txt file with the CNVs calls</td>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle">
                <bold>R (stat software)</bold>
              </td>
            </tr>
            <tr>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle">SAMTOOLS ERDS Sequence variant analyzer ERDS v1.0</td>
              <td align="left" valign="middle">BAM clean</td>
              <td align="left" valign="middle">csv file with the CNVs calls</td>
              <td align="left" valign="middle"/>
              <td align="left" valign="middle">
                <bold>Statistical analysis and visualization software <sup>#</sup></bold>
              </td>
            </tr>
            <tr>
              <td align="left" valign="middle">
                <bold>Simulated data generation tool</bold>
              </td>
              <td align="left" valign="middle">Generate simulated reads according to the needs of the user</td>
              <td align="left" valign="middle">dwgsim</td>
              <td align="left" valign="middle">-</td>
              <td align="left" valign="middle">SE or PE .fastq files </td>
              <td align="left" valign="middle">-</td>
              <td align="left" valign="middle">
                <bold>(1.1) Alignment, (1.2) <italic>De novo</italic> assembly</bold>
              </td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
        <fn>
        <p>* With solexa format we refer to the Phred quality score encoding used by Illumina Pipeline versions prior to 1.8 (Phred +64). The newer versions of the Illumina Pipeline produce read files directly in Sanger format (Phred +33). To guarantee backwards compatibility with data produced by versions of the Illumina Pipeline prior to 1.8, we have embedded a conversion step from Solexa FASTQ to Sanger FASTQ for the alignment software that does not support the solexa format. The user can remove this step if the conversion is not needed; <sup>#</sup> External software like PLINQseq for the statistical analysis or IGV for visualization is not embedded in the workflow; <sup>$</sup> If a reference genome is not available, the contigs can be used as they are for further analysis. If a reference genome is available, the contigs can be aligned back to the reference genome with BWA-SW.</p>
        </fn>
        </table-wrap-foot>
      </table-wrap>
      
      <p>The GPCG pipeline environment allows users to change execution parameters for each software tool represented as a module in the pipeline workflow. This architecture allows changing parameters, adding new parameters or execution controls (e.g., flags, options), modifying values of existent parameters, removing parameters, as well as inserting new processing modules or connections to/from data objects (data-sources/inputs or data-sinks/outputs). These workflow modifications are accomplished directly in the pipeline graphical interface by using mouse-selection and keyboard entries.</p>
      <sec id="sec2dot1-genes-03-00545">
        <title>2.1. Alignment</title>
        <p>We incorporated into the LONI pipeline some of the most used alignment software tools (<xref ref-type="fig" rid="genes-03-00545-f003">Figure 3</xref>). Through the “edit module” functionality of the graphical user interface (GUI) it is possible to visually manage all the parameters of the alignment software (e.g., number of allowed mismatches, read trimming, gap extension) encapsulated by the module itself. Each algorithm in a workflow (other than BWA-SW that runs only in single end) has a switch that allows the user to perform an alignment on either single end (SE) or paired end (PE) reads. The input reads are generally in FASTQ format. BWA accepts input read sequences also in BAM format. For paired-end data the pair reads need to be grouped together in one single BAM file. The final output is an alignment file in SAM format (see <xref ref-type="supplementary-material" rid="genes-03-00545-s001">Supplementary Materials S1, Figure A2</xref>). This file is ready to undergo quality control procedures. The choice of the alignment software is critical as different software might lead to different results in terms of mapping and downstream variant calling. It is strongly suggested to perform the alignment using more than one software and identify strategies that may provide better fit for the user’s data. BWA and BOWTIE, for example, are among the fastest alignment algorithms for both single and paired end reads, and are particularly suitable for whole genome alignments. They generally report only one alignment hit, and in the default mode guarantee neither that the best hit is found nor that the reported hit is unique. In particular, for users dealing with highly repetitive genomic regions, it is possible to force these algorithms to output all the possible hits with a high cost in terms of speed, or to use a probabilistic software like PERM that outputs all the found hits. 
The user could perform a <italic>de novo</italic> assembly (see next paragraph) for a better resolution in such regions. If the main interest is the detection of CNVs with paired-end reads, Novoalign is strongly recommended because of the high mapping accuracy, even if slower than the previously mentioned Burrows-Wheeler (BW) based software.</p>
        
        <p>For selecting parameter settings, users can find hints about the meaning and possible values of each parameter by double-clicking on a parameter icon or via the execution info menu, accessible with a right-click on the module of interest. For parameter tuning, users have to refer to the software documentation, as all the algorithms work differently. The LONI Pipeline workflow environment enhances the user’s abilities to optimize the selection of the parameters for each tool by allowing specification of a list containing a range of parameters spanning the support of each parameter of interest. This <italic>parameter-optimization</italic> strategy is exactly the basis of the development of meta-algorithms as workflow protocols [<xref ref-type="bibr" rid="B76-genes-03-00545">76</xref>,<xref ref-type="bibr" rid="B77-genes-03-00545">77</xref>,<xref ref-type="bibr" rid="B78-genes-03-00545">78</xref>].</p>
        <p>The quality control process we used to determine the tools to be incorporated is based on an up-to-date and detailed investigation of the available NGS tools, including research papers with technical and analytical details, and the online discussion panels/forums, if available, to better follow the analytical trends and open issues. The testing with simulated and real data provided us with further insight into the software reliability. Some information and guidelines to help in identifying the best set of software to analyze different types of data can be found in the references we provide, which include comparisons among tools [<xref ref-type="bibr" rid="B22-genes-03-00545">22</xref>,<xref ref-type="bibr" rid="B79-genes-03-00545">79</xref>].</p>
        <fig id="genes-03-00545-f003" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Schematic representation of alignment modules available for both single and paired-end data (2.1).</p>
          </caption>
          <graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="genes-03-00545-g003.tif"/>
        </fig>
      </sec>
      <sec>
        <title>2.2. Assembly</title>
        <p>When a reference genome is not available for the species under investigation it is necessary to perform <italic>de novo</italic> assembly of the reads. <italic>De novo</italic> assembly is also the suggested approach for reads mapping in regions prone to rearrangements, rapidly evolving, or where the reference genome might not be informative. We have developed three workflows embedding the most common <italic>de novo</italic> assembly software based on de Bruijn graphs (<xref ref-type="fig" rid="genes-03-00545-f004">Figure 4</xref>), which work with both single and paired end data. One of the most important parameters to be tuned in <italic>de novo</italic> assembly is the fixed length of the subsequences used to build the graphs, referred to as the k-mer size. The most efficient k-mer size for a particular assembly is determined by the read length as well as the error rate, and it may be estimated <italic>a priori</italic> using “ad hoc” tools provided by the software developer or by evaluating the results obtained through varying the k-mer size across different runs. Some assemblers are also able to output scaffolds (<italic>i.e.</italic>, a set of contigs with known relative orientation and distance) or supercontigs (<italic>i.e.</italic>, contigs in which gaps are allowed). When <italic>de novo</italic> assembly is used in complex regions of organisms with a known reference genome, the contigs can be aligned back to it with the BWA-SW workflow, and then undergo the subsequent analytical steps.</p>
        <fig id="genes-03-00545-f004" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Schematic representation of the <italic>de novo</italic> assembly workflows available (2.2).</p>
          </caption>
          <graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="genes-03-00545-g004.tif"/>
        </fig>
      </sec>
      <sec>
        <title>2.3. Quality Control Improvement of Reads</title>
        <sec>
          <title>2.3.1. Basic Quality Control and File Formatting</title>
          <p>After the alignment, the reads first undergo basic formatting and quality control steps. We have developed a “Basic QC” workflow in line with the consolidated and updated procedures for DNA-Seq analyses (<xref ref-type="fig" rid="genes-03-00545-f005">Figure 5</xref>).</p>
          
          <p>The output of this module is a clean, sorted, indexed BAM file that can undergo advanced QC procedures, or be used as it is for downstream analyses.</p>
        </sec>
        <sec>
          <title>2.3.2. Advanced QC</title>
          <p>Additional advanced QC issues can be addressed and fixed using the GATK toolkit (<uri>http://www.broadinstitute.org/gsa/wiki</uri>) [<xref ref-type="bibr" rid="B27-genes-03-00545">27</xref>,<xref ref-type="bibr" rid="B80-genes-03-00545">80</xref>]. The GATK developers suggested a core analytical framework that includes local realignment around Indels and base quality score recalibration [<xref ref-type="bibr" rid="B27-genes-03-00545">27</xref>,<xref ref-type="bibr" rid="B80-genes-03-00545">80</xref>]. We have embedded these processes in the “Advanced QC” workflows, adding other modules to produce quality control plots, statistics, and tracks useful for the visualization of the data (<xref ref-type="fig" rid="genes-03-00545-f006">Figure 6</xref>).</p>
          <fig id="genes-03-00545-f005" position="float">
            <label>Figure 5</label>
            <caption>
              <p>A snapshot of the general organization of the Basic QC workflow (2.1.3). After an initial file cleaning that performs various fix ups, the alignment file in Sequence Alignment/Map (SAM) format is converted into a Binary Sequence Alignment/Map (BAM) file and sorted. The workflow takes care of duplicated reads by removing or marking the potential PCR duplicates. If multiple read pairs have identical external coordinates, it only retains the pair with highest mapping quality. This step is particularly suited for paired end data and the user can switch between the two options simply by changing the REMOVE_DUPLICATES argument in the GUI related to this module. The removal step can be excluded from a workflow run depending on the interest in studying repetitive elements. In case of paired end reads, the pipeline then ensures that all mate-pair information is in sync between each read and its mate pair, fixing any incoherent information. The BAM file undergoes MD tagging, which adds a string labeling the mismatching positions. The BAM is finally indexed using the index of the reference genome.</p>
            </caption>
            <graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="genes-03-00545-g005.tif"/>
          </fig>
          <fig id="genes-03-00545-f006" position="float">
            <label>Figure 6</label>
            <caption>
              <p>A snapshot of the general organization of the Advanced QC workflow (2.1.4). (<bold>A</bold>) After the basic QC, the reads that map within Indels in the individual’s genome compared to the reference genome are locally realigned, as they may lead to alignment artifacts that can easily be misinterpreted as SNPs. The next step is the base quality score recalibration to recalibrate base quality scores of reads, by the analysis of the covariation among several features of a base (e.g., reported quality scores, the position within the read). The workflow produces plots and tables with the most important metrics for a DNA-Seq experiment (<italic>i.e.</italic>, mean quality by cycle, insert size metrics, quality score distribution, GC-bias metrics, main alignment metrics) with the PICARD software; (<bold>B</bold>) The users can then produce useful tracks for the visualization of the data in Integrative Genome Viewer (IGV). Examples are the (a) callability track (<italic>i.e.</italic>, evaluates how much a region can be trusted in terms of coverage, accuracy and quality by GATK and can be visualized as a bar chart in IGV); (b) the sliding window coverage (<italic>i.e.</italic>, a computation of average alignment over a specified window size across the genome with igvtools). The main outputs of this step are: (1) a cleaned BAM file ready to be used for variant calling, (2) a set of plots and text files that can help the user to have a general picture about the general quality of the experiment and (3) a set of track files to visualize the dataset and its features. The user can upload the indexed BAM files and these tracks in IGV to visualize and annotate the reads across the whole genome with user-produced or online tracks (RefSeq, RepeatMasker, Database of Genomic Variants).</p>
            </caption>
            <graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="genes-03-00545-g006.tif"/>
          </fig>
        </sec>
      </sec>
      <sec>
        <title>2.4. Variant Calling and Annotation</title>
        <sec id="sec2dot4dot1-genes-03-00545">
          <title>2.4.1. SNPs and Indels Calling and Annotation</title>
          <p>We explored and embedded in workflows at least three different frameworks to call SNPs and Indels from whole genome alignment data and produce a comprehensive mutation/functional analysis report (<xref ref-type="fig" rid="genes-03-00545-f007">Figure 7</xref>).</p>
          
          <p>For the Sequence Variant Analyzer (SVA, <uri>http://www.svaproject.org</uri> [<xref ref-type="bibr" rid="B51-genes-03-00545">51</xref>]) v1.0 workflow, after SNPs and Indels have been called with SAMTOOLS [<xref ref-type="bibr" rid="B13-genes-03-00545">13</xref>] (<uri>http://samtools.sourceforge.net/</uri>) and CNVs have been called with “Estimation by Read Depth with Single Nucleotide Variants” (ERDS) software v1.02 (<uri>http://web.duke.edu/~mz34/erds.htm</uri>), they undergo annotation and visualization through SVA (<xref ref-type="fig" rid="genes-03-00545-f007">Figure 7</xref>). SVA is a visualization platform for performing statistical analysis and filtering procedures as well. The workflow we have developed allows users to produce a .gsap file, which can be loaded into SVA to create a project with single or multiple annotated genomes. This version of SVA is linked to ENSEMBL hg18 annotations.</p>
          <p>Variants detected with SAMTOOLS [<xref ref-type="bibr" rid="B13-genes-03-00545">13</xref>] can be also exported as a VCF file (Table S1) and comprehensively annotated through ANNOVAR [<xref ref-type="bibr" rid="B50-genes-03-00545">50</xref>] (<xref ref-type="fig" rid="genes-03-00545-f007">Figure 7</xref>). This software allows functional annotation of genetic variants detected from diverse genomes (human genome hg18, hg19, as well as mouse, worm, fly, yeast and many others). In particular the last release of ANNOVAR retrieves variant calls and frequency information from the 1000 Genomes Project [<xref ref-type="bibr" rid="B81-genes-03-00545">81</xref>] or from the sixty genomes released by Complete Genomics [<xref ref-type="bibr" rid="B30-genes-03-00545">30</xref>]. ANNOVAR offers three different annotation options: <italic>gene-based</italic>, <italic>region-based</italic> or <italic>filter-based annotation</italic>, and all the three options are implemented.</p>
          <fig id="genes-03-00545-f007" position="float">
            <label>Figure 7</label>
            <caption>
              <p>A snapshot of the three independent workflows available for variant calling and annotation. Sequence Variant Analyzer (SVA) displays a graphical user interface (GUI) to visualize, annotate, filter and analyze the called variants.</p>
            </caption>
            <graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="genes-03-00545-g007.tif"/>
          </fig>
          <p>In the GPCG we also embedded the UnifiedGenotyperV2 from the GATK suite, which is a popular software to simultaneously call SNPs and Indels and produce a VCF output file [<xref ref-type="bibr" rid="B13-genes-03-00545">13</xref>] (<xref ref-type="fig" rid="genes-03-00545-f007">Figure 7</xref>). We connected this module with ANNOVAR to perform the complete annotation of variants, as previously described.</p>
        </sec>
        <sec>
          <title>2.4.2. CNVs Calling</title>
          <p>While general agreement on the best analytical strategy is still lacking [<xref ref-type="bibr" rid="B2-genes-03-00545">2</xref>], we have implemented some workflows, as a first wave of a more comprehensive set of tools. Among the many different approaches to call CNVs, we have chosen the three approaches described in <xref ref-type="fig" rid="genes-03-00545-f008">Figure 8</xref>.</p>
          
          <p>CNVer relies both on read depth and read pair information [<xref ref-type="bibr" rid="B48-genes-03-00545">48</xref>] in a computational framework called the donor graph, that reduces the sequencing biases causing uneven local coverage (<xref ref-type="fig" rid="genes-03-00545-f008">Figure 8</xref>). The most interesting feature of CNVer is the ability to compute the absolute copy counts of segments of the donor genome, and work with low coverage datasets. Moreover CNVer allows detecting CNVs without the need of a reference genome. The CNVs called by CNVer may be imported and visualized in the SAVANT genome browser [<xref ref-type="bibr" rid="B82-genes-03-00545">82</xref>] (<xref ref-type="supplementary-material" rid="genes-03-00545-s001">Supplementary Materials S1, Figure A18</xref>).</p>
          <fig id="genes-03-00545-f008" position="float">
            <label>Figure 8</label>
            <caption>
              <p>A snapshot of the general organization of the CNVs modules. ERDS, CNVer and CNVseq have been implemented as a first wave of tools to call CNVs in DNA-Seq data.</p>
            </caption>
            <graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="genes-03-00545-g008.tif"/>
          </fig>
          <p>CNVseq is a read depth method that detects CNVs with a robust statistical model conceptually derived from the aCGH (array Comparative genomic hybridization) analytical framework [<xref ref-type="bibr" rid="B47-genes-03-00545">47</xref>]. CNVseq uses a sequence as a template and two sets of reads, one set from a reference individual (e.g., the individual expected to show a normal ploidy) and one from the test individual we want to screen for CNVs. The two sets of reads are aligned to a template genome, and then with a sliding window approach, CNVs are detected by computing the number of reads for each individual in each sliding window, yielding ratios and copy number estimations. Additional steps with this approach are performed in R (<uri>http://www.r-project.org/</uri>), with the possibility to get a full visualization of the log<sub>2</sub> allelic ratio profiles and statistics on the detected CNVs (<xref ref-type="supplementary-material" rid="genes-03-00545-s001">Supplementary Materials S1, Figure A20</xref>).</p>
          <p>The final CNV workflow method is ERDS, which is a Hidden Markov Model (HMM) based approach that relies on read depth to infer the copy number state. It represents an extension of the method described in Bentley <italic>et al</italic>. (2008) [<xref ref-type="bibr" rid="B31-genes-03-00545">31</xref>], and described more in detail by Pelak <italic>et al</italic>. (2010) [<xref ref-type="bibr" rid="B45-genes-03-00545">45</xref>]. Expected read depth is calculated using the expectation maximization (EM) approach and corrected by GC bias. The ERDS functionality is embedded into the already described SVA 1.0 workflow for SNPs and Indels calling (<xref ref-type="sec" rid="sec2dot4dot1-genes-03-00545">Section 2.4.1</xref>), which can then be used to visually inspect and annotate the CNVs (<xref ref-type="supplementary-material" rid="genes-03-00545-s001">Supplementary Materials S1, Figure A21</xref>).</p>
        </sec>
      </sec>
      <sec>
        <title>2.5. Evaluation with Simulated Data</title>
        <p>We evaluated all the previously described GPCG workflows (<xref ref-type="table" rid="genes-03-00545-t002">Table 2</xref> and <xref ref-type="supplementary-material" rid="genes-03-00545-s001">Supplementary Materials S2</xref>) with 30 million simulated reads (both SE and PE) generated with the dwgsim simulation tool, a publicly available utility for simulating whole-genome Illumina reads [<xref ref-type="bibr" rid="B83-genes-03-00545">83</xref>]. Runtimes and information about the performances of the workflow with simulated data are summarized in <xref ref-type="supplementary-material" rid="genes-03-00545-s001">Supplementary Table S2</xref>. We ran all the workflows with default parameters. </p>
        <p>We reported for all the modules a better time performance of the GPCG compared to Galaxy, together with the absence for GPCG of the time required for the data upload.</p>
        <p>Using the same simulated data files, we tested the common modules embedded both in our GPCG and on the Galaxy webserver interface. Due to the structure of Galaxy and the reduced number of processes available, we couldn’t compare its performances on a workflow scale. The modules shared by GPCG and Galaxy were: conversion of solexa into sanger format, BWA and Bowtie paired end alignment, the PICARD utilities to fix the mate information, mark the duplicates, collect the alignment, GC bias, and insert size metrics. We reported the GPCG and Galaxy time performances in <xref ref-type="table" rid="genes-03-00545-t004">Table 4</xref>. </p>
        <table-wrap id="genes-03-00545-t004" position="float">
          <object-id pub-id-type="pii">genes-03-00545-t004_Table 4</object-id>
          <label>Table 4</label>
          <caption>
            <p>Runtimes and performances on simulated data for modules in common across Graphical Pipeline for Computational Genomics (GPCG) and Galaxy. The performances of GPCG in terms of run time were better than Galaxy for all the tested modules.</p>
          </caption>
                 <table rules="all" style="border: solid thin">
            <thead>
              <tr style="background: black">
                <th align="left" valign="middle" style="color: white">Analytical category</th>
                <th align="center" valign="middle" style="color: white">Input file (file size)</th>
                <th align="center" valign="middle" style="color: white">Job description</th>
                <th align="center" valign="middle" style="color: white">GPCG workflow name</th>
                <th align="center" valign="middle" style="color: white">Time</th>
                <th align="center" valign="middle" style="color: white">Galaxy module name</th>
                <th align="center" valign="middle" style="color: white">Time</th>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td align="left" valign="middle">
                  <bold>Data upload</bold>
                </td>
                <td align="center" valign="middle">2.4 Gb × 2 (PE)</td>
                <td align="center" valign="middle">Upload of the data into the webserver</td>
                <td align="center" valign="middle">(N/A)</td>
                <td align="center" valign="middle">(N/A)</td>
                <td align="center" valign="middle">Upload of the data</td>
                <td align="center" valign="middle">180 min</td>
              </tr>
              <tr>
                <td align="left" valign="middle">
                  <bold>Preprocessing</bold>
                </td>
                <td align="center" valign="middle">2.4 Gb fastq file</td>
                <td align="center" valign="middle">Conversion of solexa into sanger format</td>
                <td align="center" valign="middle">Preprocessing pipeline: sol2sanger</td>
                <td align="center" valign="middle">6 min</td>
                <td align="center" valign="middle">FASTQ Groomer</td>
                <td align="center" valign="middle">45 min</td>
              </tr>
              <tr>
                <td rowspan="2" align="left" valign="middle">
                  <bold>Alignment</bold>
                </td>
                <td align="center" valign="middle">2.4 Gb × 2 fastq files (PE)</td>
                <td align="center" valign="middle">BWA paired end alignment with default parameters</td>
                <td align="center" valign="middle">BWA PE (1.1)</td>
                <td align="center" valign="middle">132 min</td>
                <td align="center" valign="middle">Map with BWA for Illumina</td>
                <td align="center" valign="middle">240 min</td>
              </tr>
              <tr>
                <td align="center" valign="middle">2.4 Gb × 2 fastq files (PE)</td>
                <td align="center" valign="middle">Bowtie paired end alignment with default parameters</td>
                <td align="center" valign="middle">BOWTIE PE (1.1)</td>
                <td align="center" valign="middle">205 min</td>
                <td align="center" valign="middle">Map with Bowtie for Illumina</td>
                <td align="center" valign="middle">270 min</td>
              </tr>
              <tr>
                <td rowspan="5" align="left" valign="middle">
                  <bold>Quality control (metrics and cleaning)</bold>
                </td>
                <td align="center" valign="middle">1.6 Gb SAM file</td>
                <td align="center" valign="middle">Synchronization of mate-pair information</td>
                <td align="center" valign="middle">Fix Mate Information (Basic QC, 1.3)</td>
                <td align="center" valign="middle">6 min</td>
                <td align="center" valign="middle">Paired Read Mate Fixer for paired data</td>
                <td align="center" valign="middle">30 min</td>
              </tr>
              <tr>
                <td align="center" valign="middle">1.6 Gb SAM file</td>
                <td align="center" valign="middle">Marks duplicate reads</td>
                <td align="center" valign="middle">Mark Duplicates (Basic QC, 1.3)</td>
                <td align="center" valign="middle">2 min</td>
                <td align="center" valign="middle">Marks duplicate reads</td>
                <td align="center" valign="middle">20 min</td>
              </tr>
              <tr>
                <td align="center" valign="middle">1.6 Gb SAM file</td>
                <td align="center" valign="middle">Reports the alignment metric of a SAM/BAM file</td>
                <td align="center" valign="middle">Collect Alignment Summary Metrics (Advanced QC, 1.4)</td>
                <td align="center" valign="middle">2 min</td>
                <td align="center" valign="middle">SAM/BAM Alignment Summary Metrics</td>
                <td align="center" valign="middle">6 min</td>
              </tr>
              <tr>
                <td align="center" valign="middle">1.6 Gb SAM file</td>
                <td align="center" valign="middle">Reports the SAM/BAM GC bias metrics</td>
                <td align="center" valign="middle">Collect GC Bias Metrics (Advanced QC, 1.4)</td>
                <td align="center" valign="middle">3 min</td>
                <td align="center" valign="middle">SAM/BAM GC Bias Metrics</td>
                <td align="center" valign="middle">7 min</td>
              </tr>
              <tr>
                <td align="center" valign="middle">1.6 Gb SAM file</td>
                <td align="center" valign="middle">Reports the insert size metrics</td>
                <td align="center" valign="middle">Collect Insert Size Metrics (Advanced QC, 1.4)</td>
                <td align="center" valign="middle">2 min</td>
                <td align="center" valign="middle">Insertion size metrics for PAIRED data</td>
                <td align="center" valign="middle">6 min</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>The performance of the GPCG platform was better than that of Galaxy in terms of run time for all modules. Furthermore, a bottleneck in the timing of analysis is the upload of the data to the Galaxy webserver, together with the unpredictable waiting time in the queue for the processes (<xref ref-type="table" rid="genes-03-00545-t004">Table 4</xref>). As the status of the process cannot be checked during the execution, the user must wait for the end of the run of a process to input the results into the next step without any hint about the elapsed time, and without the possibility to automatically keep track of the run times.</p>
      </sec>
      <sec>
        <title>2.6. Evaluation with Real Data</title>
        <p>We also evaluated the performance of our GPCG workflows in a “real life scenario”. <xref ref-type="supplementary-material" rid="genes-03-00545-s001">Supplementary Table S2</xref> reports the performances of the whole-genome alignment with BWA of an entire Illumina flowcell with an average of 130 million Illumina PE reads per lane. The average time required for the alignment of one lane was 6 hours, and for the whole flowcell was roughly 48 hours using GPCG. The output of the alignment was one SAM alignment file for each lane. In all large scale NGS data analysis it is important to manage the flow of both data and input/output files as they traverse complex workflows. The GPCG provides a simple way to set up the input file section (see magnified data source panel on the right, <xref ref-type="fig" rid="genes-03-00545-f009">Figure 9</xref>) and to manage the naming of output files from individual modules using the transformation tool within a module definition (<xref ref-type="fig" rid="genes-03-00545-f009">Figure 9</xref>).</p>
        <fig id="genes-03-00545-f009" position="float">
          <label>Figure 9</label>
          <caption>
            <p>Snapshot from the module we used to run the alignment of an entire flowcell with BWA-PE. This workflow includes the indexing of the reference genome (BWA: Index), the alignment of the two reads separately (BWA-aln) and their final combination (BWA: samse/sampe). The sixteen input files (<italic>i.e.</italic>, one forward and one reverse read for each one of the eight lanes of the flowcell) are shown in the data source panel magnified on the right. The pipeline allows managing all the options of the BWA alignment software through the module’s GUI without worrying about complex command lines.</p>
          </caption>
          <graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="genes-03-00545-g009.tif"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>3. Discussion</title>
      <p>The GPCG was developed with a flexible graphical interface for efficient biomedical computing and distributed informatics research [<xref ref-type="bibr" rid="B65-genes-03-00545">65</xref>], and is intended to satisfy the needs of geneticists and computational scientists who are interested in whole genome, exome and targeted sequencing. The graphical workflows we have developed save researchers the time needed to implement and test the command lines of individual analytical steps, logically group complex operations into re-usable units, and allow these units to be aggregated into larger analytical workflows (e.g., QC following alignment).</p>
      <p>The GPCG offers several advantages compared to the available workflow environments for DNA-Seq data analysis. The GPCG includes a set of pipelines ready-to-go with modules logically interconnected between each other on the basis of the current analytical trends, while others offer a set of single modular routines that the user cannot connect in workflows. Moreover the LONI environment doesn’t display any restrictive upper space limits on storage and on the available per-process RAM, and even more importantly doesn’t require any data upload, thus eliminating bottlenecks with data staging to/from the servers. The limit of 100 GB of storage and 64 GB of RAM in Galaxy, together with the unpredictably long data upload time, don’t fit well with the needs of users analyzing whole genome sequencing data, as the overall size of the original forward and reverse fastq files for a single whole 30× genome sequenced in paired end is already ~100 GB. We tested the single modules embedded both in Galaxy and GPCG with the same input files and analytical parameters (<xref ref-type="table" rid="genes-03-00545-t004">Table 4</xref>). The performance of the GPCG in terms of run time was better compared to Galaxy for all the modules, with the additional time saved in GPCG (~3 hours) as no data upload on the system is required. The user of the GPCG also has the ability to disconnect and reconnect to running workflows, and to monitor at any time the progress and to check the status of a previously initiated process, with a detailed and interactive report of time performances and job execution info (e.g., output and error streams).</p>
      <p>Since the GPCG is open source, the user can access the current version of the GPCG pipeline online at <uri>http://pipeline.loni.ucla.edu</uri>. The entire newly developed computational-genomics infrastructure LONI pipeline includes the pipeline server (<uri>http://pipeline.loni.ucla.edu/DPS</uri>), the pipeline web-start server (<uri>http://pipeline.loni.ucla.edu/PWS</uri>), the genomics workflows (<uri>http://pipeline.loni.ucla.edu/services/library-navigator/</uri>), collaborative wiki documentation for these protocols (<uri>http://ucla.in/pbMgUm</uri>), and community support (<uri>http://informatics.googlecode.com/</uri>). All these resources have been developed and are currently supported via an open and collaborative infrastructure. Constructive utilization of diverse tools and computational expertise may be shared as pipeline workflows between professionals, novice users and trainees [<xref ref-type="bibr" rid="B65-genes-03-00545">65</xref>,<xref ref-type="bibr" rid="B70-genes-03-00545">70</xref>].</p>
      <p>In this first release of the GPCG we embedded only some popular tools for managing and analyzing DNA-Seq data from the initial raw reads to variant calling and annotation. However, to overcome this limitation, we are regularly testing and adding tools to be shared in the future releases of the genomics pipeline. Also the users can integrate new processes and implement new workflows promoting a community-based protocol validation and openly share and disseminate knowledge, tools and resources.</p>
    </sec>
    <sec sec-type="methods">
      <title>4. Methods</title>
      <sec>
        <title>4.1. The LONI Environment and Workflow Creation</title>
        <p>To translate DNA-Seq data analysis into a graphical pipeline solution within the LONI environment [<xref ref-type="bibr" rid="B65-genes-03-00545">65</xref>] we initially described all global processes in the <italic>protocol design</italic> or <italic>skeletonization</italic> step, using a top-down approach. We outlined the general classes of sequence data analysis, then the appropriate sub-classes of analyses, specific tools, test-data, invocation of concrete tools, and a detailed example of executable syntax for each step (<uri>http://pipeline.loni.ucla.edu/support/user-guide/building-a-workflow</uri>, [<xref ref-type="bibr" rid="B65-genes-03-00545">65</xref>]). The previously mentioned analytical steps are constructed from a series of command line executable processes, referred to as <italic>modules</italic> or <italic>nodes</italic>, which are connected to each other to form a visual workflow analysis protocol. All the logically concatenated <italic>modules/nodes</italic> involved in the same analytical step (e.g., alignment) comprise a workflow.</p>
        <p>After all necessary modules that make up a workflow are independently defined and validated through the LONI pipeline GUI interface (<uri>http://pipeline.loni.ucla.edu/support/user-guide/creating-modules/</uri>), they are integrated into a coherent workflow.</p>
      </sec>
      <sec>
        <title>4.2. Accessibility of the GPCG</title>
        <p>The user can access the current version of the GPCG online at <uri>http://pipeline.loni.ucla.edu</uri>. The local Client can be set up by the users following the instructions reported at <uri>http://pipeline.loni.ucla.edu/support/user-guide/installation/</uri>. The GPCG can also be deployed on a server by downloading the distributed pipeline server installer (DPS) (<uri>http://pipeline.loni.ucla.edu/support/server-guide/installation/</uri>), and following the instructions available at <uri>http://pipeline.loni.ucla.edu/DPS</uri>. The GPCG workflows can also be directly launched via the pipeline web-start server (PWS) (<uri>http://pipeline.loni.ucla.edu/PWS</uri>). To search across the entire set of workflows the user can rely on an interactive graphical navigator (<uri>http://pipeline.loni.ucla.edu/services/library-navigator/</uri>), which enables not only the discovery, but also web-based utilization of this new computational-genomics infrastructure (<xref ref-type="fig" rid="genes-03-00545-f010">Figure 10</xref>).</p>
        
      </sec>
      <sec>
        <title>4.3. Evaluation of the GPCG Workflows with Simulated and Real Data</title>
        <p><xref ref-type="table" rid="genes-03-00545-t001">Table 1</xref> presents the software we have embedded in the workflows released with GPCG. We evaluated the workflows with both real and simulated data (<xref ref-type="supplementary-material" rid="genes-03-00545-s001">Supplementary Table S2</xref>).</p>
        <p><italic>Evaluation with simulated data.</italic> We used dwgsim [<xref ref-type="bibr" rid="B84-genes-03-00545">84</xref>], a utility for whole-genome Illumina reads simulation, contained in DNAA v0.1.2 (<uri>http://sourceforge.net/projects/dnaa/</uri>), to generate Illumina-like short sequences, using the default empirical error model illustrated on DNAA’s Whole-Genome Simulation web-site (<uri>http://dnaa.sf.net</uri>). In total we generated 30 million reads with 100 bp length, using the complete human genome (hg18) as a reference and with default parameters. We also developed a module that allows the users to generate simulation datasets according to their needs (see <xref ref-type="table" rid="genes-03-00545-t002">Table 2</xref> and <xref ref-type="supplementary-material" rid="genes-03-00545-s001">Supplementary Materials S2</xref>).</p>
        
        <p><italic>Evaluation with real data.</italic> To further compare the behavior of our workflows on real applications, we used an entire flow cell (8 lanes) with an average of 130 million Illumina PE reads per lane with length of 100 bp (fastq produced with the Illumina Pipeline v1.8) to be aligned by BWA-PE against the whole human genome sequence (assembly: NCBI36.1/hg18).</p>
        <fig id="genes-03-00545-f010" position="float">
          <label>Figure 10</label>
          <caption>
            <p>The LONI pipeline computational library Navigator allows the interactive traversal, inspection, downloading and utilization of specific NGS analyses. Nested insert images illustrate the most common steps of search, selection, comparison, modification and execution of available end-to-end computational genomics workflows.</p>
          </caption>
          <graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="genes-03-00545-g010.tif"/>
        </fig>
        <p>These experiments were produced on an Illumina HiSeq 2000 DNA sequencer using v5 kits.</p>
        <p>All the testing runs were performed on a Linux server (RHEL 5, 64-bit) with dual Intel Xeon X5650 2.67 GHz 6-core processors, 96 GB of RAM, and 2 × 10 k RPM 150 GB WD VelociRaptor disks in RAID 0.</p>
      </sec>
    </sec>
    <sec sec-type="conclusions">
      <title>5. Conclusions</title>
      <p>The availability of workflows to manage and analyze NGS data in a straightforward way may play a role in triggering genomic advances in human health related translational research in psychiatric genetics. Currently, NGS technology is emerging as a fundamental basis on which to understand disease complexity and heterogeneity both for common and rare diseases, with benefits to clinical diagnostics and care once research findings are translated into clinical tests. Sequencing clinical subjects is becoming a method of choice in translational studies of diseases, and genetic defects underlying several genetic disorders have been identified through whole exome or whole genome sequencing [<xref ref-type="bibr" rid="B85-genes-03-00545">85</xref>,<xref ref-type="bibr" rid="B86-genes-03-00545">86</xref>]. Nevertheless, an understanding of how genome variability leads to disease pathogenesis is still far from complete for the vast majority of genetic diseases [<xref ref-type="bibr" rid="B56-genes-03-00545">56</xref>], at least as to the meaning of many variants present in the genome of healthy individuals. The computational challenge for DNA-Seq data analysis is often a bottleneck, as many different tools are constantly emerging, and often require bioinformatics skills. We are proposing the GPCG as a handy and helpful graphical analysis platform to improve the efficiency of high-throughput data analysis in diverse applications of DNA-Seq analysis projects. There are two tiers of validation for the proposed GPCG pipeline infrastructure. The first tier is validation of the technical protocols (as we compared the results of the GPCG pipeline protocol against Galaxy). This evaluation confirms the programmatic reliability and reproducibility of the results using identical computational libraries. However, the second tier of validation is more important as it provides scientific evidence of the value added by the new GPCG pipeline infrastructure.
This scientific validation includes for example the comparison of results obtained with different algorithms in terms of read mapping, variant calling and quality control procedures to find the process that best fits the data, together with improvements in speed and high-throughput volume processing. For example, the GPCG framework may be used to construct a new genomics computing protocol that explicitly utilizes specific sequence analysis tools. The entire pipeline protocol may be shared with other users who can easily plug-in new data and/or swap alternative modules for analogous processing steps (employing different software tools). Such multi-investigator experimental studies would provide cues about how to select appropriate software tools and how different libraries compare in processing different types of data sets (e.g., varying read length, fragments or paired ends, highly repetitive genome, <italic>etc</italic>.).</p>
    </sec>
   
  </body>
  <back>
   <ack>
      <title>Acknowledgments</title>
      <p>This work was funded in part by the National Institutes of Health through Grants U54 RR021813, 9P41EB015922-15, 2-P41-RR-013642-15, R01 MH71940, U24-RR025736, U24-RR021992, U24-RR021760, U24-RR026057 and NSF grants 0716055 and 1023115. We are also indebted to the members of the Laboratory of Neuro Imaging (LONI), the Biomedical Informatics Research Network (BIRN), and Clinical and Translational Science Award (CTSA) investigators, NIH Program officials, and many general users for their patience with beta-testing the Pipeline and for providing useful feedback about its state, functionality and usability. Benjamin Berman and Zack Ramjan from the USC Epigenome Center, University of Southern California, provided help with definition of sequence pre-processing. The authors also thank Bernard Chang for assistance with system administration of the UCI Pipeline servers.</p>
    </ack>
    <notes>
      <title>Conflicts of Interest</title>
      <p>The authors declare no conflict of interest.</p>
    </notes>
    <ref-list>
      <title>References</title>
      <ref id="B1-genes-03-00545">
        <label>1.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Dalca</surname>
              <given-names>A.V.</given-names>
            </name>
            <name>
              <surname>Brudno</surname>
              <given-names>M.</given-names>
            </name>
          </person-group>
          <article-title>Genome variation discovery with high-throughput sequencing data</article-title>
          <source>Brief. Bioinform.</source>
          <year>2010</year>
          <volume>11</volume>
          <fpage>3</fpage>
          <lpage>14</lpage>
          <pub-id pub-id-type="doi">10.1093/bib/bbp058</pub-id>
        </citation>
      </ref>
      <ref id="B2-genes-03-00545">
        <label>2.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Alkan</surname>
              <given-names>C.</given-names>
            </name>
            <name>
              <surname>Coe</surname>
              <given-names>B.P.</given-names>
            </name>
            <name>
              <surname>Eichler</surname>
              <given-names>E.E.</given-names>
            </name>
          </person-group>
          <article-title>Genome structural variation discovery and genotyping</article-title>
          <source>Nat. Rev. Genet.</source>
          <year>2011</year>
          <volume>12</volume>
          <fpage>363</fpage>
          <lpage>376</lpage>
          <pub-id pub-id-type="doi">10.1038/nrg2958</pub-id>
        </citation>
      </ref>
      <ref id="B3-genes-03-00545">
        <label>3.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Flicek</surname>
              <given-names>P.</given-names>
            </name>
            <name>
              <surname>Birney</surname>
              <given-names>E.</given-names>
            </name>
          </person-group>
          <article-title>Sense from sequence reads: Methods for alignment and assembly</article-title>
          <source>Nat. Methods</source>
          <year>2009</year>
          <volume>6</volume>
          <fpage>S6</fpage>
          <lpage>S12</lpage>
          <pub-id pub-id-type="doi">10.1038/nmeth.1376</pub-id>
        </citation>
      </ref>
      <ref id="B4-genes-03-00545">
        <label>4.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Pepke</surname>
              <given-names>S.</given-names>
            </name>
            <name>
              <surname>Wold</surname>
              <given-names>B.</given-names>
            </name>
            <name>
              <surname>Mortazavi</surname>
              <given-names>A.</given-names>
            </name>
          </person-group>
          <article-title>Computation for ChIP-seq and RNA-seq studies</article-title>
          <source>Nat. Methods</source>
          <year>2009</year>
          <volume>6</volume>
          <fpage>S22</fpage>
          <lpage>S32</lpage>
          <pub-id pub-id-type="doi">10.1038/nmeth.1371</pub-id>
        </citation>
      </ref>
      <ref id="B5-genes-03-00545">
        <label>5.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Meaburn</surname>
              <given-names>E.</given-names>
            </name>
            <name>
              <surname>Schulz</surname>
              <given-names>R.</given-names>
            </name>
          </person-group>
          <article-title>Next generation sequencing in epigenetics: Insights and challenges</article-title>
          <source>Semin. Cell Dev. Biol.</source>
          <year>2011</year>
          <volume>23</volume>
          <fpage>192</fpage>
          <lpage>199</lpage>
        <pub-id pub-id-type="pmid">22027613</pub-id></citation>
      </ref>
      <ref id="B6-genes-03-00545">
        <label>6.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Walsh</surname>
              <given-names>T.</given-names>
            </name>
            <name>
              <surname>McClellan</surname>
              <given-names>J.M.</given-names>
            </name>
            <name>
              <surname>McCarthy</surname>
              <given-names>S.E.</given-names>
            </name>
            <name>
              <surname>Addington</surname>
              <given-names>A.M.</given-names>
            </name>
            <name>
              <surname>Pierce</surname>
              <given-names>S.B.</given-names>
            </name>
            <name>
              <surname>Cooper</surname>
              <given-names>G.M.</given-names>
            </name>
            <name>
              <surname>Nord</surname>
              <given-names>A.S.</given-names>
            </name>
            <name>
              <surname>Kusenda</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Malhotra</surname>
              <given-names>D.</given-names>
            </name>
            <name>
              <surname>Bhandari</surname>
              <given-names>A.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>Rare structural variants disrupt multiple genes in neurodevelopmental pathways in schizophrenia</article-title>
          <source>Science</source>
          <year>2008</year>
          <volume>320</volume>
          <fpage>539</fpage>
          <lpage>543</lpage>
        <pub-id pub-id-type="doi">10.1126/science.1155174</pub-id><pub-id pub-id-type="pmid">18369103</pub-id></citation>
      </ref>
      <ref id="B7-genes-03-00545">
        <label>7.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Rumble</surname>
              <given-names>S.M.</given-names>
            </name>
            <name>
              <surname>Lacroute</surname>
              <given-names>P.</given-names>
            </name>
            <name>
              <surname>Dalca</surname>
              <given-names>A.V.</given-names>
            </name>
            <name>
              <surname>Fiume</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Sidow</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Brudno</surname>
              <given-names>M.</given-names>
            </name>
          </person-group>
          <article-title>SHRiMP: Accurate mapping of short color-space reads</article-title>
          <source>PLoS Comput. Biol.</source>
          <year>2009</year>
          <volume>5</volume>
          <fpage>e1000386</fpage>
          <pub-id pub-id-type="doi">10.1371/journal.pcbi.1000386</pub-id>
        </citation>
      </ref>
      <ref id="B8-genes-03-00545">
        <label>8.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Lin</surname>
              <given-names>H.</given-names>
            </name>
            <name>
              <surname>Zhang</surname>
              <given-names>Z.</given-names>
            </name>
            <name>
              <surname>Zhang</surname>
              <given-names>M.Q.</given-names>
            </name>
            <name>
              <surname>Ma</surname>
              <given-names>B.</given-names>
            </name>
            <name>
              <surname>Li</surname>
              <given-names>M.</given-names>
            </name>
          </person-group>
          <article-title>Zoom! Zillions of oligos mapped</article-title>
          <source>Bioinformatics</source>
          <year>2008</year>
          <volume>24</volume>
          <fpage>2431</fpage>
          <lpage>2437</lpage>
        <pub-id pub-id-type="doi">10.1093/bioinformatics/btn416</pub-id><pub-id pub-id-type="pmid">18684737</pub-id></citation>
      </ref>
      <ref id="B9-genes-03-00545">
        <label>9.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Li</surname>
              <given-names>R.</given-names>
            </name>
            <name>
              <surname>Yu</surname>
              <given-names>C.</given-names>
            </name>
            <name>
              <surname>Li</surname>
              <given-names>Y.</given-names>
            </name>
            <name>
              <surname>Lam</surname>
              <given-names>T.W.</given-names>
            </name>
            <name>
              <surname>Yiu</surname>
              <given-names>S.M.</given-names>
            </name>
            <name>
              <surname>Kristiansen</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Wang</surname>
              <given-names>J.</given-names>
            </name>
          </person-group>
          <article-title>SOAP2: An improved ultrafast tool for short read alignment</article-title>
          <source>Bioinformatics</source>
          <year>2009</year>
          <volume>25</volume>
          <fpage>1966</fpage>
          <lpage>1967</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btp336</pub-id>
        </citation>
      </ref>
      <ref id="B10-genes-03-00545">
        <label>10.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Chen</surname>
              <given-names>Y.</given-names>
            </name>
            <name>
              <surname>Souaiaia</surname>
              <given-names>T.</given-names>
            </name>
            <name>
              <surname>Chen</surname>
              <given-names>T.</given-names>
            </name>
          </person-group>
          <article-title>PerM: Efficient mapping of short sequencing reads with periodic full sensitive spaced seeds</article-title>
          <source>Bioinformatics</source>
          <year>2009</year>
          <volume>25</volume>
          <fpage>2514</fpage>
          <lpage>2521</lpage>
        <pub-id pub-id-type="doi">10.1093/bioinformatics/btp486</pub-id><pub-id pub-id-type="pmid">19675096</pub-id></citation>
      </ref>
      <ref id="B11-genes-03-00545">
        <label>11.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Langmead</surname>
              <given-names>B.</given-names>
            </name>
            <name>
              <surname>Trapnell</surname>
              <given-names>C.</given-names>
            </name>
            <name>
              <surname>Pop</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Salzberg</surname>
              <given-names>S.L.</given-names>
            </name>
          </person-group>
          <article-title>Ultrafast and memory-efficient alignment of short DNA sequences to the human genome</article-title>
          <source>Genome Biol.</source>
          <year>2009</year>
          <volume>10</volume>
          <fpage>R25</fpage>
          <pub-id pub-id-type="doi">10.1186/gb-2009-10-3-r25</pub-id>
        </citation>
      </ref>
      <ref id="B12-genes-03-00545">
        <label>12.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Li</surname>
              <given-names>H.</given-names>
            </name>
            <name>
              <surname>Durbin</surname>
              <given-names>R.</given-names>
            </name>
          </person-group>
          <article-title>Fast and accurate short read alignment with Burrows-Wheeler transform</article-title>
          <source>Bioinformatics</source>
          <year>2009</year>
          <volume>25</volume>
          <fpage>1754</fpage>
          <lpage>1760</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btp324</pub-id>
        </citation>
      </ref>
      <ref id="B13-genes-03-00545">
        <label>13.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Chen</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Wallis</surname>
              <given-names>J.W.</given-names>
            </name>
            <name>
              <surname>McLellan</surname>
              <given-names>M.D.</given-names>
            </name>
            <name>
              <surname>Larson</surname>
              <given-names>D.E.</given-names>
            </name>
            <name>
              <surname>Kalicki</surname>
              <given-names>J.M.</given-names>
            </name>
            <name>
              <surname>Pohl</surname>
              <given-names>C.S.</given-names>
            </name>
            <name>
              <surname>McGrath</surname>
              <given-names>S.D.</given-names>
            </name>
            <name>
              <surname>Wendl</surname>
              <given-names>M.C.</given-names>
            </name>
            <name>
              <surname>Zhang</surname>
              <given-names>Q.</given-names>
            </name>
            <name>
              <surname>Locke</surname>
              <given-names>D.P.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>BreakDancer: An algorithm for high-resolution mapping of genomic structural variation</article-title>
          <source>Nat. Methods</source>
          <year>2009</year>
          <volume>6</volume>
          <fpage>677</fpage>
          <lpage>681</lpage>
        <pub-id pub-id-type="doi">10.1038/nmeth.1363</pub-id><pub-id pub-id-type="pmid">19668202</pub-id></citation>
      </ref>
      <ref id="B14-genes-03-00545">
        <label>14.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Olson</surname>
              <given-names>S.A.</given-names>
            </name>
          </person-group>
          <article-title>EMBOSS opens up sequence analysis. European Molecular Biology Open Software Suite</article-title>
          <source>Brief. Bioinform.</source>
          <year>2002</year>
          <volume>3</volume>
          <fpage>87</fpage>
          <lpage>91</lpage>
          <pub-id pub-id-type="doi">10.1093/bib/3.1.87</pub-id>
        </citation>
      </ref>
      <ref id="B15-genes-03-00545">
        <label>15.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Myers</surname>
              <given-names>E.W.</given-names>
            </name>
          </person-group>
          <article-title>Toward simplifying and accurately formulating fragment assembly</article-title>
          <source>J. Comput. Biol.</source>
          <year>1995</year>
          <volume>2</volume>
          <fpage>275</fpage>
          <lpage>290</lpage>
          <pub-id pub-id-type="doi">10.1089/cmb.1995.2.275</pub-id>
        </citation>
      </ref>
      <ref id="B16-genes-03-00545">
        <label>16.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Pevzner</surname>
              <given-names>P.A.</given-names>
            </name>
            <name>
              <surname>Tang</surname>
              <given-names>H.</given-names>
            </name>
            <name>
              <surname>Waterman</surname>
              <given-names>M.S.</given-names>
            </name>
          </person-group>
          <article-title>An Eulerian path approach to DNA fragment assembly</article-title>
          <source>Proc. Natl. Acad. Sci. USA</source>
          <year>2001</year>
          <volume>98</volume>
          <fpage>9748</fpage>
          <lpage>9753</lpage>
          <pub-id pub-id-type="doi">10.1073/pnas.171285098</pub-id>
        </citation>
      </ref>
      <ref id="B17-genes-03-00545">
        <label>17.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Myers</surname>
              <given-names>E.W.</given-names>
            </name>
            <name>
              <surname>Sutton</surname>
              <given-names>G.G.</given-names>
            </name>
            <name>
              <surname>Delcher</surname>
              <given-names>A.L.</given-names>
            </name>
            <name>
              <surname>Dew</surname>
              <given-names>I.M.</given-names>
            </name>
            <name>
              <surname>Fasulo</surname>
              <given-names>D.P.</given-names>
            </name>
            <name>
              <surname>Flanigan</surname>
              <given-names>M.J.</given-names>
            </name>
            <name>
              <surname>Kravitz</surname>
              <given-names>S.A.</given-names>
            </name>
            <name>
              <surname>Mobarry</surname>
              <given-names>C.M.</given-names>
            </name>
            <name>
              <surname>Reinert</surname>
              <given-names>K.H.</given-names>
            </name>
            <name>
              <surname>Remington</surname>
              <given-names>K.A.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>A whole-genome assembly of <italic>Drosophila</italic></article-title>
          <source>Science</source>
          <year>2000</year>
          <volume>287</volume>
          <fpage>2196</fpage>
          <lpage>2204</lpage>
        <pub-id pub-id-type="doi">10.1126/science.287.5461.2196</pub-id><pub-id pub-id-type="pmid">10731133</pub-id></citation>
      </ref>
      <ref id="B18-genes-03-00545">
        <label>18.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Jaffe</surname>
              <given-names>D.B.</given-names>
            </name>
            <name>
              <surname>Butler</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Gnerre</surname>
              <given-names>S.</given-names>
            </name>
            <name>
              <surname>Mauceli</surname>
              <given-names>E.</given-names>
            </name>
            <name>
              <surname>Lindblad-Toh</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Mesirov</surname>
              <given-names>J.P.</given-names>
            </name>
            <name>
              <surname>Zody</surname>
              <given-names>M.C.</given-names>
            </name>
            <name>
              <surname>Lander</surname>
              <given-names>E.S.</given-names>
            </name>
          </person-group>
          <article-title>Whole-genome sequence assembly for mammalian genomes: Arachne 2</article-title>
          <source>Genome Res.</source>
          <year>2003</year>
          <volume>13</volume>
          <fpage>91</fpage>
          <lpage>96</lpage>
          <pub-id pub-id-type="doi">10.1101/gr.828403</pub-id>
        </citation>
      </ref>
      <ref id="B19-genes-03-00545">
        <label>19.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Zerbino</surname>
              <given-names>D.R.</given-names>
            </name>
            <name>
              <surname>Birney</surname>
              <given-names>E.</given-names>
            </name>
          </person-group>
          <article-title>Velvet: Algorithms for <italic>de novo</italic> short read assembly using de Bruijn graphs</article-title>
          <source>Genome Res.</source>
          <year>2008</year>
          <volume>18</volume>
          <fpage>821</fpage>
          <lpage>829</lpage>
          <pub-id pub-id-type="doi">10.1101/gr.074492.107</pub-id>
        </citation>
      </ref>
      <ref id="B20-genes-03-00545">
        <label>20.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Li</surname>
              <given-names>R.</given-names>
            </name>
            <name>
              <surname>Zhu</surname>
              <given-names>H.</given-names>
            </name>
            <name>
              <surname>Ruan</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Qian</surname>
              <given-names>W.</given-names>
            </name>
            <name>
              <surname>Fang</surname>
              <given-names>X.</given-names>
            </name>
            <name>
              <surname>Shi</surname>
              <given-names>Z.</given-names>
            </name>
            <name>
              <surname>Li</surname>
              <given-names>Y.</given-names>
            </name>
            <name>
              <surname>Li</surname>
              <given-names>S.</given-names>
            </name>
            <name>
              <surname>Shan</surname>
              <given-names>G.</given-names>
            </name>
            <name>
              <surname>Kristiansen</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Yang</surname>
              <given-names>H.</given-names>
            </name>
            <name>
              <surname>Wang</surname>
              <given-names>J.</given-names>
            </name>
          </person-group>
          <article-title><italic>De novo</italic> assembly of human genomes with massively parallel short read sequencing</article-title>
          <source>Genome Res.</source>
          <year>2010</year>
          <volume>20</volume>
          <fpage>265</fpage>
          <lpage>272</lpage>
        <pub-id pub-id-type="doi">10.1101/gr.097261.109</pub-id><pub-id pub-id-type="pmid">20019144</pub-id></citation>
      </ref>
      <ref id="B21-genes-03-00545">
        <label>21.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Simpson</surname>
              <given-names>J.T.</given-names>
            </name>
            <name>
              <surname>Wong</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Jackman</surname>
              <given-names>S.D.</given-names>
            </name>
            <name>
              <surname>Schein</surname>
              <given-names>J.E.</given-names>
            </name>
            <name>
              <surname>Jones</surname>
              <given-names>S.J.</given-names>
            </name>
            <name>
              <surname>Birol</surname>
              <given-names>I.</given-names>
            </name>
          </person-group>
          <article-title>ABySS: A parallel assembler for short read sequence data</article-title>
          <source>Genome Res.</source>
          <year>2009</year>
          <volume>19</volume>
          <fpage>1117</fpage>
          <lpage>1123</lpage>
          <pub-id pub-id-type="doi">10.1101/gr.089532.108</pub-id>
        </citation>
      </ref>
      <ref id="B22-genes-03-00545">
        <label>22.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Miller</surname>
              <given-names>J.R.</given-names>
            </name>
            <name>
              <surname>Koren</surname>
              <given-names>S.</given-names>
            </name>
            <name>
              <surname>Sutton</surname>
              <given-names>G.</given-names>
            </name>
          </person-group>
          <article-title>Assembly algorithms for next-generation sequencing data</article-title>
          <source>Genomics</source>
          <year>2010</year>
          <volume>95</volume>
          <fpage>315</fpage>
          <lpage>327</lpage>
        <pub-id pub-id-type="doi">10.1016/j.ygeno.2010.03.001</pub-id><pub-id pub-id-type="pmid">20211242</pub-id></citation>
      </ref>
      <ref id="B23-genes-03-00545">
        <label>23.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Ewing</surname>
              <given-names>B.</given-names>
            </name>
            <name>
              <surname>Green</surname>
              <given-names>P.</given-names>
            </name>
          </person-group>
          <article-title>Base-calling of automated sequencer traces using phred. II. Error probabilities</article-title>
          <source>Genome Res.</source>
          <year>1998</year>
          <volume>8</volume>
          <fpage>186</fpage>
          <lpage>194</lpage>
        <pub-id pub-id-type="pmid">9521922</pub-id></citation>
      </ref>
      <ref id="B24-genes-03-00545">
        <label>24.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Brockman</surname>
              <given-names>W.</given-names>
            </name>
            <name>
              <surname>Alvarez</surname>
              <given-names>P.</given-names>
            </name>
            <name>
              <surname>Young</surname>
              <given-names>S.</given-names>
            </name>
            <name>
              <surname>Garber</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Giannoukos</surname>
              <given-names>G.</given-names>
            </name>
            <name>
              <surname>Lee</surname>
              <given-names>W.L.</given-names>
            </name>
            <name>
              <surname>Russ</surname>
              <given-names>C.</given-names>
            </name>
            <name>
              <surname>Lander</surname>
              <given-names>E.S.</given-names>
            </name>
            <name>
              <surname>Nusbaum</surname>
              <given-names>C.</given-names>
            </name>
            <name>
              <surname>Jaffe</surname>
              <given-names>D.B.</given-names>
            </name>
          </person-group>
          <article-title>Quality scores and SNP detection in sequencing-by-synthesis systems</article-title>
          <source>Genome Res.</source>
          <year>2008</year>
          <volume>18</volume>
          <fpage>763</fpage>
          <lpage>770</lpage>
          <pub-id pub-id-type="doi">10.1101/gr.070227.107</pub-id>
        </citation>
      </ref>
      <ref id="B25-genes-03-00545">
        <label>25.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Li</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Nordborg</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Li</surname>
              <given-names>L.M.</given-names>
            </name>
          </person-group>
          <article-title>Adjust quality scores from alignment and improve sequencing accuracy</article-title>
          <source>Nucleic Acids Res.</source>
          <year>2004</year>
          <volume>32</volume>
          <fpage>5183</fpage>
          <lpage>5191</lpage>
        <pub-id pub-id-type="doi">10.1093/nar/gkh850</pub-id><pub-id pub-id-type="pmid">15459287</pub-id></citation>
      </ref>
      <ref id="B26-genes-03-00545">
        <label>26.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Li</surname>
              <given-names>R.</given-names>
            </name>
            <name>
              <surname>Li</surname>
              <given-names>Y.</given-names>
            </name>
            <name>
              <surname>Fang</surname>
              <given-names>X.</given-names>
            </name>
            <name>
              <surname>Yang</surname>
              <given-names>H.</given-names>
            </name>
            <name>
              <surname>Wang</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Kristiansen</surname>
              <given-names>K.</given-names>
            </name>
          </person-group>
          <article-title>SNP detection for massively parallel whole-genome resequencing</article-title>
          <source>Genome Res.</source>
          <year>2009</year>
          <volume>19</volume>
          <fpage>1124</fpage>
          <lpage>1132</lpage>
        <pub-id pub-id-type="doi">10.1101/gr.088013.108</pub-id><pub-id pub-id-type="pmid">19420381</pub-id></citation>
      </ref>
      <ref id="B27-genes-03-00545">
        <label>27.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>DePristo</surname>
              <given-names>M.A.</given-names>
            </name>
            <name>
              <surname>Banks</surname>
              <given-names>E.</given-names>
            </name>
            <name>
              <surname>Poplin</surname>
              <given-names>R.</given-names>
            </name>
            <name>
              <surname>Garimella</surname>
              <given-names>K.V.</given-names>
            </name>
            <name>
              <surname>Maguire</surname>
              <given-names>J.R.</given-names>
            </name>
            <name>
              <surname>Hartl</surname>
              <given-names>C.</given-names>
            </name>
            <name>
              <surname>Philippakis</surname>
              <given-names>A.A.</given-names>
            </name>
            <name>
              <surname>del Angel</surname>
              <given-names>G.</given-names>
            </name>
            <name>
              <surname>Rivas</surname>
              <given-names>M.A.</given-names>
            </name>
            <name>
              <surname>Hanna</surname>
              <given-names>M.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>A framework for variation discovery and genotyping using next-generation DNA sequencing data</article-title>
          <source>Nat. Genet.</source>
          <year>2011</year>
          <volume>43</volume>
          <fpage>491</fpage>
          <lpage>498</lpage>
        <pub-id pub-id-type="doi">10.1038/ng.806</pub-id><pub-id pub-id-type="pmid">21478889</pub-id></citation>
      </ref>
      <ref id="B28-genes-03-00545">
        <label>28.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Ning</surname>
              <given-names>Z.</given-names>
            </name>
            <name>
              <surname>Cox</surname>
              <given-names>A.J.</given-names>
            </name>
            <name>
              <surname>Mullikin</surname>
              <given-names>J.C.</given-names>
            </name>
          </person-group>
          <article-title>SSAHA: A fast search method for large DNA databases</article-title>
          <source>Genome Res.</source>
          <year>2001</year>
          <volume>11</volume>
          <fpage>1725</fpage>
          <lpage>1729</lpage>
          <pub-id pub-id-type="doi">10.1101/gr.194201</pub-id>
        </citation>
      </ref>
      <ref id="B29-genes-03-00545">
        <label>29.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Martin</surname>
              <given-names>M.V.</given-names>
            </name>
            <name>
              <surname>Rollins</surname>
              <given-names>B.</given-names>
            </name>
            <name>
              <surname>Sequeira</surname>
              <given-names>P.A.</given-names>
            </name>
            <name>
              <surname>Mesen</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Byerley</surname>
              <given-names>W.</given-names>
            </name>
            <name>
              <surname>Stein</surname>
              <given-names>R.</given-names>
            </name>
            <name>
              <surname>Moon</surname>
              <given-names>E.A.</given-names>
            </name>
            <name>
              <surname>Akil</surname>
              <given-names>H.</given-names>
            </name>
            <name>
              <surname>Jones</surname>
              <given-names>E.G.</given-names>
            </name>
            <name>
              <surname>Watson</surname>
              <given-names>S.J.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>Exon expression in lymphoblastoid cell lines from subjects with schizophrenia before and after glucose deprivation</article-title>
          <source>BMC Med. Genomics</source>
          <year>2009</year>
          <volume>2</volume>
          <fpage>62</fpage>
          <pub-id pub-id-type="doi">10.1186/1755-8794-2-62</pub-id>
        </citation>
      </ref>
      <ref id="B30-genes-03-00545">
        <label>30.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Drmanac</surname>
              <given-names>R.</given-names>
            </name>
            <name>
              <surname>Sparks</surname>
              <given-names>A.B.</given-names>
            </name>
            <name>
              <surname>Callow</surname>
              <given-names>M.J.</given-names>
            </name>
            <name>
              <surname>Halpern</surname>
              <given-names>A.L.</given-names>
            </name>
            <name>
              <surname>Burns</surname>
              <given-names>N.L.</given-names>
            </name>
            <name>
              <surname>Kermani</surname>
              <given-names>B.G.</given-names>
            </name>
            <name>
              <surname>Carnevali</surname>
              <given-names>P.</given-names>
            </name>
            <name>
              <surname>Nazarenko</surname>
              <given-names>I.</given-names>
            </name>
            <name>
              <surname>Nilsen</surname>
              <given-names>G.B.</given-names>
            </name>
            <name>
              <surname>Yeung</surname>
              <given-names>G.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>Human genome sequencing using unchained base reads on self-assembling DNA nanoarrays</article-title>
          <source>Science</source>
          <year>2010</year>
          <volume>327</volume>
          <fpage>78</fpage>
          <lpage>81</lpage>
        <pub-id pub-id-type="doi">10.1126/science.1181498</pub-id><pub-id pub-id-type="pmid">19892942</pub-id></citation>
      </ref>
      <ref id="B31-genes-03-00545">
        <label>31.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Bentley</surname>
              <given-names>D.R.</given-names>
            </name>
            <name>
              <surname>Balasubramanian</surname>
              <given-names>S.</given-names>
            </name>
            <name>
              <surname>Swerdlow</surname>
              <given-names>H.P.</given-names>
            </name>
            <name>
              <surname>Smith</surname>
              <given-names>G.P.</given-names>
            </name>
            <name>
              <surname>Milton</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Brown</surname>
              <given-names>C.G.</given-names>
            </name>
            <name>
              <surname>Hall</surname>
              <given-names>K.P.</given-names>
            </name>
            <name>
              <surname>Evers</surname>
              <given-names>D.J.</given-names>
            </name>
            <name>
              <surname>Barnes</surname>
              <given-names>C.L.</given-names>
            </name>
            <name>
              <surname>Bignell</surname>
              <given-names>H.R.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>Accurate whole human genome sequencing using reversible terminator chemistry</article-title>
          <source>Nature</source>
          <year>2008</year>
          <volume>456</volume>
          <fpage>53</fpage>
          <lpage>59</lpage>
        <pub-id pub-id-type="doi">10.1038/nature07517</pub-id><pub-id pub-id-type="pmid">18987734</pub-id></citation>
      </ref>
      <ref id="B32-genes-03-00545">
        <label>32.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Koboldt</surname>
              <given-names>D.C.</given-names>
            </name>
            <name>
              <surname>Chen</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Wylie</surname>
              <given-names>T.</given-names>
            </name>
            <name>
              <surname>Larson</surname>
              <given-names>D.E.</given-names>
            </name>
            <name>
              <surname>McLellan</surname>
              <given-names>M.D.</given-names>
            </name>
            <name>
              <surname>Mardis</surname>
              <given-names>E.R.</given-names>
            </name>
            <name>
              <surname>Weinstock</surname>
              <given-names>G.M.</given-names>
            </name>
            <name>
              <surname>Wilson</surname>
              <given-names>R.K.</given-names>
            </name>
            <name>
              <surname>Ding</surname>
              <given-names>L.</given-names>
            </name>
          </person-group>
          <article-title>VarScan: Variant detection in massively parallel sequencing of individual and pooled samples</article-title>
          <source>Bioinformatics</source>
          <year>2009</year>
          <volume>25</volume>
          <fpage>2283</fpage>
          <lpage>2285</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btp373</pub-id>
        </citation>
      </ref>
      <ref id="B33-genes-03-00545">
        <label>33.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Wheeler</surname>
              <given-names>D.A.</given-names>
            </name>
            <name>
              <surname>Srinivasan</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Egholm</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Shen</surname>
              <given-names>Y.</given-names>
            </name>
            <name>
              <surname>Chen</surname>
              <given-names>L.</given-names>
            </name>
            <name>
              <surname>McGuire</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>He</surname>
              <given-names>W.</given-names>
            </name>
            <name>
              <surname>Chen</surname>
              <given-names>Y.J.</given-names>
            </name>
            <name>
              <surname>Makhijani</surname>
              <given-names>V.</given-names>
            </name>
            <name>
              <surname>Roth</surname>
              <given-names>G.T.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>The complete genome of an individual by massively parallel DNA sequencing</article-title>
          <source>Nature</source>
          <year>2008</year>
          <volume>452</volume>
          <fpage>872</fpage>
          <lpage>876</lpage>
        <pub-id pub-id-type="doi">10.1038/nature06884</pub-id><pub-id pub-id-type="pmid">18421352</pub-id></citation>
      </ref>
      <ref id="B34-genes-03-00545">
        <label>34.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Mokry</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Feitsma</surname>
              <given-names>H.</given-names>
            </name>
            <name>
              <surname>Nijman</surname>
              <given-names>I.J.</given-names>
            </name>
            <name>
              <surname>de Bruijn</surname>
              <given-names>E.</given-names>
            </name>
            <name>
              <surname>van der Zaag</surname>
              <given-names>P.J.</given-names>
            </name>
            <name>
              <surname>Guryev</surname>
              <given-names>V.</given-names>
            </name>
            <name>
              <surname>Cuppen</surname>
              <given-names>E.</given-names>
            </name>
          </person-group>
          <article-title>Accurate SNP and mutation detection by targeted custom microarray-based genomic enrichment of short-fragment sequencing libraries</article-title>
          <source>Nucleic Acids Res.</source>
          <year>2010</year>
          <volume>38</volume>
          <fpage>e116</fpage>
          <pub-id pub-id-type="doi">10.1093/nar/gkq072</pub-id>
        </citation>
      </ref>
      <ref id="B35-genes-03-00545">
        <label>35.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Shen</surname>
              <given-names>Y.</given-names>
            </name>
            <name>
              <surname>Wan</surname>
              <given-names>Z.</given-names>
            </name>
            <name>
              <surname>Coarfa</surname>
              <given-names>C.</given-names>
            </name>
            <name>
              <surname>Drabek</surname>
              <given-names>R.</given-names>
            </name>
            <name>
              <surname>Chen</surname>
              <given-names>L.</given-names>
            </name>
            <name>
              <surname>Ostrowski</surname>
              <given-names>E.A.</given-names>
            </name>
            <name>
              <surname>Liu</surname>
              <given-names>Y.</given-names>
            </name>
            <name>
              <surname>Weinstock</surname>
              <given-names>G.M.</given-names>
            </name>
            <name>
              <surname>Wheeler</surname>
              <given-names>D.A.</given-names>
            </name>
            <name>
              <surname>Gibbs</surname>
              <given-names>R.A.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>A SNP discovery method to assess variant allele probability from next-generation resequencing data</article-title>
          <source>Genome Res.</source>
          <year>2010</year>
          <volume>20</volume>
          <fpage>273</fpage>
          <lpage>280</lpage>
        <pub-id pub-id-type="doi">10.1101/gr.096388.109</pub-id><pub-id pub-id-type="pmid">20019143</pub-id></citation>
      </ref>
      <ref id="B36-genes-03-00545">
        <label>36.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Hoberman</surname>
              <given-names>R.</given-names>
            </name>
            <name>
              <surname>Dias</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Ge</surname>
              <given-names>B.</given-names>
            </name>
            <name>
              <surname>Harmsen</surname>
              <given-names>E.</given-names>
            </name>
            <name>
              <surname>Mayhew</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Verlaan</surname>
              <given-names>D.J.</given-names>
            </name>
            <name>
              <surname>Kwan</surname>
              <given-names>T.</given-names>
            </name>
            <name>
              <surname>Dewar</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Blanchette</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Pastinen</surname>
              <given-names>T.</given-names>
            </name>
          </person-group>
          <article-title>A probabilistic approach for SNP discovery in high-throughput human resequencing data</article-title>
          <source>Genome Res.</source>
          <year>2009</year>
          <volume>19</volume>
          <fpage>1542</fpage>
          <lpage>1552</lpage>
          <pub-id pub-id-type="doi">10.1101/gr.092072.109</pub-id>
          <pub-id pub-id-type="pmid">19605794</pub-id>
        </citation>
      </ref>
      <ref id="B37-genes-03-00545">
        <label>37.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Malhis</surname>
              <given-names>N.</given-names>
            </name>
            <name>
              <surname>Jones</surname>
              <given-names>S.J.</given-names>
            </name>
          </person-group>
          <article-title>High quality SNP calling using Illumina data at shallow coverage</article-title>
          <source>Bioinformatics</source>
          <year>2010</year>
          <volume>26</volume>
          <fpage>1029</fpage>
          <lpage>1035</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btq092</pub-id>
        </citation>
      </ref>
      <ref id="B38-genes-03-00545">
        <label>38.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Handsaker</surname>
              <given-names>R.E.</given-names>
            </name>
            <name>
              <surname>Korn</surname>
              <given-names>J.M.</given-names>
            </name>
            <name>
              <surname>Nemesh</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>McCarroll</surname>
              <given-names>S.A.</given-names>
            </name>
          </person-group>
          <article-title>Discovery and genotyping of genome structural polymorphism by sequencing on a population scale</article-title>
          <source>Nat. Genet.</source>
          <year>2011</year>
          <volume>43</volume>
          <fpage>269</fpage>
          <lpage>276</lpage>
          <pub-id pub-id-type="doi">10.1038/ng.768</pub-id>
        </citation>
      </ref>
      <ref id="B39-genes-03-00545">
        <label>39.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Kim</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Sinha</surname>
              <given-names>S.</given-names>
            </name>
          </person-group>
          <article-title>Indelign: A probabilistic framework for annotation of insertions and deletions in a multiple alignment</article-title>
          <source>Bioinformatics</source>
          <year>2007</year>
          <volume>23</volume>
          <fpage>289</fpage>
          <lpage>297</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btl578</pub-id>
        </citation>
      </ref>
      <ref id="B40-genes-03-00545">
        <label>40.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Li</surname>
              <given-names>H.</given-names>
            </name>
            <name>
              <surname>Ruan</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Durbin</surname>
              <given-names>R.</given-names>
            </name>
          </person-group>
          <article-title>Mapping short DNA sequencing reads and calling variants using mapping quality scores</article-title>
          <source>Genome Res.</source>
          <year>2008</year>
          <volume>18</volume>
          <fpage>1851</fpage>
          <lpage>1858</lpage>
          <pub-id pub-id-type="doi">10.1101/gr.078212.108</pub-id>
        </citation>
      </ref>
      <ref id="B41-genes-03-00545">
        <label>41.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Hormozdiari</surname>
              <given-names>F.</given-names>
            </name>
            <name>
              <surname>Alkan</surname>
              <given-names>C.</given-names>
            </name>
            <name>
              <surname>Eichler</surname>
              <given-names>E.E.</given-names>
            </name>
            <name>
              <surname>Sahinalp</surname>
              <given-names>S.C.</given-names>
            </name>
          </person-group>
          <article-title>Combinatorial algorithms for structural variation detection in high-throughput sequenced genomes</article-title>
          <source>Genome Res.</source>
          <year>2009</year>
          <volume>19</volume>
          <fpage>1270</fpage>
          <lpage>1278</lpage>
          <pub-id pub-id-type="doi">10.1101/gr.088633.108</pub-id>
        </citation>
      </ref>
      <ref id="B42-genes-03-00545">
        <label>42.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Lee</surname>
              <given-names>S.</given-names>
            </name>
            <name>
              <surname>Hormozdiari</surname>
              <given-names>F.</given-names>
            </name>
            <name>
              <surname>Alkan</surname>
              <given-names>C.</given-names>
            </name>
            <name>
              <surname>Brudno</surname>
              <given-names>M.</given-names>
            </name>
          </person-group>
          <article-title>MoDIL: Detecting small indels from clone-end sequencing with mixtures of distributions</article-title>
          <source>Nat. Methods</source>
          <year>2009</year>
          <volume>6</volume>
          <fpage>473</fpage>
          <lpage>474</lpage>
          <pub-id pub-id-type="doi">10.1038/nmeth.f.256</pub-id>
        </citation>
      </ref>
      <ref id="B43-genes-03-00545">
        <label>43.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Korbel</surname>
              <given-names>J.O.</given-names>
            </name>
            <name>
              <surname>Abyzov</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Mu</surname>
              <given-names>X.J.</given-names>
            </name>
            <name>
              <surname>Carriero</surname>
              <given-names>N.</given-names>
            </name>
            <name>
              <surname>Cayting</surname>
              <given-names>P.</given-names>
            </name>
            <name>
              <surname>Zhang</surname>
              <given-names>Z.</given-names>
            </name>
            <name>
              <surname>Snyder</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Gerstein</surname>
              <given-names>M.B.</given-names>
            </name>
          </person-group>
          <article-title>PEMer: A computational framework with simulation-based error models for inferring genomic structural variants from massive paired-end sequencing data</article-title>
          <source>Genome Biol.</source>
          <year>2009</year>
          <volume>10</volume>
          <fpage>R23</fpage>
          <pub-id pub-id-type="doi">10.1186/gb-2009-10-2-r23</pub-id>
        </citation>
      </ref>
      <ref id="B44-genes-03-00545">
        <label>44.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Sindi</surname>
              <given-names>S.</given-names>
            </name>
            <name>
              <surname>Helman</surname>
              <given-names>E.</given-names>
            </name>
            <name>
              <surname>Bashir</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Raphael</surname>
              <given-names>B.J.</given-names>
            </name>
          </person-group>
          <article-title>A geometric approach for classification and comparison of structural variants</article-title>
          <source>Bioinformatics</source>
          <year>2009</year>
          <volume>25</volume>
          <fpage>i222</fpage>
          <lpage>i230</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btp208</pub-id>
        </citation>
      </ref>
      <ref id="B45-genes-03-00545">
        <label>45.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Pelak</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Shianna</surname>
              <given-names>K.V.</given-names>
            </name>
            <name>
              <surname>Ge</surname>
              <given-names>D.</given-names>
            </name>
            <name>
              <surname>Maia</surname>
              <given-names>J.M.</given-names>
            </name>
            <name>
              <surname>Zhu</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Smith</surname>
              <given-names>J.P.</given-names>
            </name>
            <name>
              <surname>Cirulli</surname>
              <given-names>E.T.</given-names>
            </name>
            <name>
              <surname>Fellay</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Dickson</surname>
              <given-names>S.P.</given-names>
            </name>
            <name>
              <surname>Gumbs</surname>
              <given-names>C.E.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>The characterization of twenty sequenced human genomes</article-title>
          <source>PLoS Genet.</source>
          <year>2010</year>
          <volume>6</volume>
          <fpage>e1001111</fpage>
          <pub-id pub-id-type="doi">10.1371/journal.pgen.1001111</pub-id>
        </citation>
      </ref>
      <ref id="B46-genes-03-00545">
        <label>46.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Ye</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Schulz</surname>
              <given-names>M.H.</given-names>
            </name>
            <name>
              <surname>Long</surname>
              <given-names>Q.</given-names>
            </name>
            <name>
              <surname>Apweiler</surname>
              <given-names>R.</given-names>
            </name>
            <name>
              <surname>Ning</surname>
              <given-names>Z.</given-names>
            </name>
          </person-group>
          <article-title>Pindel: A pattern growth approach to detect break points of large deletions and medium sized insertions from paired-end short reads</article-title>
          <source>Bioinformatics</source>
          <year>2009</year>
          <volume>25</volume>
          <fpage>2865</fpage>
          <lpage>2871</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btp394</pub-id>
          <pub-id pub-id-type="pmid">19561018</pub-id>
        </citation>
      </ref>
      <ref id="B47-genes-03-00545">
        <label>47.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Xie</surname>
              <given-names>C.</given-names>
            </name>
            <name>
              <surname>Tammi</surname>
              <given-names>M.T.</given-names>
            </name>
          </person-group>
          <article-title>CNV-seq, a new method to detect copy number variation using high-throughput sequencing</article-title>
          <source>BMC Bioinformatics</source>
          <year>2009</year>
          <volume>10</volume>
          <fpage>80</fpage>
          <pub-id pub-id-type="doi">10.1186/1471-2105-10-80</pub-id>
        </citation>
      </ref>
      <ref id="B48-genes-03-00545">
        <label>48.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Medvedev</surname>
              <given-names>P.</given-names>
            </name>
            <name>
              <surname>Fiume</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Dzamba</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Smith</surname>
              <given-names>T.</given-names>
            </name>
            <name>
              <surname>Brudno</surname>
              <given-names>M.</given-names>
            </name>
          </person-group>
          <article-title>Detecting copy number variation with mated short reads</article-title>
          <source>Genome Res.</source>
          <year>2010</year>
          <volume>20</volume>
          <fpage>1613</fpage>
          <lpage>1622</lpage>
          <pub-id pub-id-type="doi">10.1101/gr.106344.110</pub-id>
        </citation>
      </ref>
      <ref id="B49-genes-03-00545">
        <label>49.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Nielsen</surname>
              <given-names>R.</given-names>
            </name>
            <name>
              <surname>Paul</surname>
              <given-names>J.S.</given-names>
            </name>
            <name>
              <surname>Albrechtsen</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Song</surname>
              <given-names>Y.S.</given-names>
            </name>
          </person-group>
          <article-title>Genotype and SNP calling from next-generation sequencing data</article-title>
          <source>Nat. Rev. Genet.</source>
          <year>2011</year>
          <volume>12</volume>
          <fpage>443</fpage>
          <lpage>451</lpage>
          <pub-id pub-id-type="doi">10.1038/nrg2986</pub-id>
          <pub-id pub-id-type="pmid">21587300</pub-id>
        </citation>
      </ref>
      <ref id="B50-genes-03-00545">
        <label>50.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Wang</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Li</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Hakonarson</surname>
              <given-names>H.</given-names>
            </name>
          </person-group>
          <article-title>ANNOVAR: Functional annotation of genetic variants from high-throughput sequencing data</article-title>
          <source>Nucleic Acids Res.</source>
          <year>2010</year>
          <volume>38</volume>
          <fpage>e164</fpage>
          <pub-id pub-id-type="doi">10.1093/nar/gkq603</pub-id>
          <pub-id pub-id-type="pmid">20601685</pub-id>
        </citation>
      </ref>
      <ref id="B51-genes-03-00545">
        <label>51.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Ge</surname>
              <given-names>D.</given-names>
            </name>
            <name>
              <surname>Ruzzo</surname>
              <given-names>E.K.</given-names>
            </name>
            <name>
              <surname>Shianna</surname>
              <given-names>K.V.</given-names>
            </name>
            <name>
              <surname>He</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Pelak</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Heinzen</surname>
              <given-names>E.L.</given-names>
            </name>
            <name>
              <surname>Need</surname>
              <given-names>A.C.</given-names>
            </name>
            <name>
              <surname>Cirulli</surname>
              <given-names>E.T.</given-names>
            </name>
            <name>
              <surname>Maia</surname>
              <given-names>J.M.</given-names>
            </name>
            <name>
              <surname>Dickson</surname>
              <given-names>S.P.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>SVA: Software for annotating and visualizing sequenced human genomes</article-title>
          <source>Bioinformatics</source>
          <year>2011</year>
          <volume>27</volume>
          <fpage>1998</fpage>
          <lpage>2000</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btr317</pub-id>
          <pub-id pub-id-type="pmid">21624899</pub-id>
        </citation>
      </ref>
      <ref id="B52-genes-03-00545">
        <label>52.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Neale</surname>
              <given-names>B.M.</given-names>
            </name>
            <name>
              <surname>Rivas</surname>
              <given-names>M.A.</given-names>
            </name>
            <name>
              <surname>Voight</surname>
              <given-names>B.F.</given-names>
            </name>
            <name>
              <surname>Altshuler</surname>
              <given-names>D.</given-names>
            </name>
            <name>
              <surname>Devlin</surname>
              <given-names>B.</given-names>
            </name>
            <name>
              <surname>Orho-Melander</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Kathiresan</surname>
              <given-names>S.</given-names>
            </name>
            <name>
              <surname>Purcell</surname>
              <given-names>S.M.</given-names>
            </name>
            <name>
              <surname>Roeder</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Daly</surname>
              <given-names>M.J.</given-names>
            </name>
          </person-group>
          <article-title>Testing for an unusual distribution of rare variants</article-title>
          <source>PLoS Genet.</source>
          <year>2011</year>
          <volume>7</volume>
          <fpage>e1001322</fpage>
          <pub-id pub-id-type="doi">10.1371/journal.pgen.1001322</pub-id>
          <pub-id pub-id-type="pmid">21408211</pub-id>
        </citation>
      </ref>
      <ref id="B53-genes-03-00545">
        <label>53.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Adzhubei</surname>
              <given-names>I.A.</given-names>
            </name>
            <name>
              <surname>Schmidt</surname>
              <given-names>S.</given-names>
            </name>
            <name>
              <surname>Peshkin</surname>
              <given-names>L.</given-names>
            </name>
            <name>
              <surname>Ramensky</surname>
              <given-names>V.E.</given-names>
            </name>
            <name>
              <surname>Gerasimova</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Bork</surname>
              <given-names>P.</given-names>
            </name>
            <name>
              <surname>Kondrashov</surname>
              <given-names>A.S.</given-names>
            </name>
            <name>
              <surname>Sunyaev</surname>
              <given-names>S.R.</given-names>
            </name>
          </person-group>
          <article-title>A method and server for predicting damaging missense mutations</article-title>
          <source>Nat. Methods</source>
          <year>2010</year>
          <volume>7</volume>
          <fpage>248</fpage>
          <lpage>249</lpage>
          <pub-id pub-id-type="doi">10.1038/nmeth0410-248</pub-id>
        </citation>
      </ref>
      <ref id="B54-genes-03-00545">
        <label>54.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Yandell</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Huff</surname>
              <given-names>C.</given-names>
            </name>
            <name>
              <surname>Hu</surname>
              <given-names>H.</given-names>
            </name>
            <name>
              <surname>Singleton</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Moore</surname>
              <given-names>B.</given-names>
            </name>
            <name>
              <surname>Xing</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Jorde</surname>
              <given-names>L.B.</given-names>
            </name>
            <name>
              <surname>Reese</surname>
              <given-names>M.G.</given-names>
            </name>
          </person-group>
          <article-title>A probabilistic disease-gene finder for personal genomes</article-title>
          <source>Genome Res.</source>
          <year>2011</year>
          <volume>21</volume>
          <fpage>1529</fpage>
          <lpage>1542</lpage>
          <pub-id pub-id-type="doi">10.1101/gr.123158.111</pub-id>
          <pub-id pub-id-type="pmid">21700766</pub-id>
        </citation>
      </ref>
      <ref id="B55-genes-03-00545">
        <label>55.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Zhang</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Chiodini</surname>
              <given-names>R.</given-names>
            </name>
            <name>
              <surname>Badr</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Zhang</surname>
              <given-names>G.</given-names>
            </name>
          </person-group>
          <article-title>The impact of next-generation sequencing on genomics</article-title>
          <source>J. Genet. Genomics</source>
          <year>2011</year>
          <volume>38</volume>
          <fpage>95</fpage>
          <lpage>109</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jgg.2011.02.003</pub-id>
          <pub-id pub-id-type="pmid">21477781</pub-id>
        </citation>
      </ref>
      <ref id="B56-genes-03-00545">
        <label>56.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Torkamani</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Scott-Van Zeeland</surname>
              <given-names>A.A.</given-names>
            </name>
            <name>
              <surname>Topol</surname>
              <given-names>E.J.</given-names>
            </name>
            <name>
              <surname>Schork</surname>
              <given-names>N.J.</given-names>
            </name>
          </person-group>
          <article-title>Annotating individual human genomes</article-title>
          <source>Genomics</source>
          <year>2011</year>
          <volume>98</volume>
          <fpage>233</fpage>
          <lpage>241</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ygeno.2011.07.006</pub-id>
          <pub-id pub-id-type="pmid">21839162</pub-id>
        </citation>
      </ref>
      <ref id="B57-genes-03-00545">
        <label>57.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Mardis</surname>
              <given-names>E.R.</given-names>
            </name>
          </person-group>
          <article-title>The $1,000 genome, the $100,000 analysis?</article-title>
          <source>Genome Med.</source>
          <year>2010</year>
          <volume>2</volume>
          <fpage>84</fpage>
          <pub-id pub-id-type="doi">10.1186/gm205</pub-id>
        </citation>
      </ref>
      <ref id="B58-genes-03-00545">
        <label>58.</label>
        <citation citation-type="book">
          <person-group person-group-type="author">
            <name>
              <surname>Milano</surname>
              <given-names>F.</given-names>
            </name>
          </person-group>
          <article-title>Power System Architecture</article-title>
          <source>Power System Modelling and Scripting</source>
          <person-group person-group-type="editor">
            <name>
              <surname>Milano</surname>
              <given-names>F.</given-names>
            </name>
          </person-group>
          <publisher-name>Springer</publisher-name>
          <publisher-loc>Berlin/Heidelberg, Germany</publisher-loc>
          <year>2010</year>
          <fpage>19</fpage>
          <lpage>30</lpage>
        </citation>
      </ref>
      <ref id="B59-genes-03-00545">
        <label>59.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Wang</surname>
              <given-names>D.</given-names>
            </name>
            <name>
              <surname>Zender</surname>
              <given-names>C.</given-names>
            </name>
            <name>
              <surname>Jenks</surname>
              <given-names>S.</given-names>
            </name>
          </person-group>
          <article-title>Efficient clustered server-side data analysis workflows using SWAMP</article-title>
          <source>Earth Sci. Inform.</source>
          <year>2009</year>
          <volume>2</volume>
          <fpage>141</fpage>
          <lpage>155</lpage>
          <pub-id pub-id-type="doi">10.1007/s12145-009-0021-z</pub-id>
        </citation>
      </ref>
      <ref id="B60-genes-03-00545">
        <label>60.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Ye</surname>
              <given-names>X.-Q.</given-names>
            </name>
            <name>
              <surname>Wang</surname>
              <given-names>G.-H.</given-names>
            </name>
            <name>
              <surname>Huang</surname>
              <given-names>G.-J.</given-names>
            </name>
            <name>
              <surname>Bian</surname>
              <given-names>X.-W.</given-names>
            </name>
            <name>
              <surname>Qian</surname>
              <given-names>G.-S.</given-names>
            </name>
            <name>
              <surname>Yu</surname>
              <given-names>S.-C.</given-names>
            </name>
          </person-group>
          <article-title>Heterogeneity of mitochondrial membrane potential: A novel tool to isolate and identify cancer stem cells from a tumor mass?</article-title>
          <source>Stem Cell Rev. Rep.</source>
          <year>2011</year>
          <volume>7</volume>
          <fpage>153</fpage>
          <lpage>160</lpage>
          <pub-id pub-id-type="doi">10.1007/s12015-010-9122-9</pub-id>
        </citation>
      </ref>
      <ref id="B61-genes-03-00545">
        <label>61.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Yoo</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Ha</surname>
              <given-names>I.</given-names>
            </name>
            <name>
              <surname>Chang</surname>
              <given-names>G.</given-names>
            </name>
            <name>
              <surname>Jung</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Park</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Kim</surname>
              <given-names>Y.</given-names>
            </name>
          </person-group>
          <article-title>CNVAS: Copy number variation analysis system—The analysis tool for genomic alteration with a powerful visualization module</article-title>
          <source>BioChip J.</source>
          <year>2011</year>
          <volume>5</volume>
          <fpage>265</fpage>
          <lpage>270</lpage>
          <pub-id pub-id-type="doi">10.1007/s13206-011-5311-0</pub-id>
        </citation>
      </ref>
      <ref id="B62-genes-03-00545">
        <label>62.</label>
        <citation citation-type="confproc">
          <person-group person-group-type="author">
            <name>
              <surname>Chard</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Onyuksel</surname>
              <given-names>C.</given-names>
            </name>
            <name>
              <surname>Wei</surname>
              <given-names>T.</given-names>
            </name>
            <name>
              <surname>Sulakhe</surname>
              <given-names>D.</given-names>
            </name>
            <name>
              <surname>Madduri</surname>
              <given-names>R.</given-names>
            </name>
            <name>
              <surname>Foster</surname>
              <given-names>I.</given-names>
            </name>
          </person-group>
          <article-title>Build Grid Enabled Scientific Workflows Using Gravi and Taverna</article-title>
          <source>Proceedings of IEEE Fourth International Conference on eScience, 2008. eScience '08</source>
          <conf-loc>Indianapolis, IN, USA</conf-loc>
          <conf-date>7–12 December 2008</conf-date>
          <fpage>614</fpage>
          <lpage>619</lpage>
        </citation>
      </ref>
      <ref id="B63-genes-03-00545">
        <label>63.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Ludäscher</surname>
              <given-names>B.</given-names>
            </name>
            <name>
              <surname>Altintas</surname>
              <given-names>I.</given-names>
            </name>
            <name>
              <surname>Berkley</surname>
              <given-names>C.</given-names>
            </name>
            <name>
              <surname>Higgins</surname>
              <given-names>D.</given-names>
            </name>
            <name>
              <surname>Jaeger</surname>
              <given-names>E.</given-names>
            </name>
            <name>
              <surname>Jones</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Lee</surname>
              <given-names>E.A.</given-names>
            </name>
            <name>
              <surname>Tao</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Zhao</surname>
              <given-names>Y.</given-names>
            </name>
          </person-group>
          <article-title>Scientific workflow management and the Kepler system</article-title>
          <source>Concurr. Comput. Pract. Exp.</source>
          <year>2006</year>
          <volume>18</volume>
          <fpage>1039</fpage>
          <lpage>1065</lpage>
        <pub-id pub-id-type="doi">10.1002/cpe.994</pub-id></citation>
      </ref>
      <ref id="B64-genes-03-00545">
        <label>64.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Goecks</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Nekrutenko</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Taylor</surname>
              <given-names>J.</given-names>
            </name>
            <collab>The Galaxy Team</collab>
          </person-group>
          <article-title>Galaxy: A comprehensive approach for supporting accessible, reproducible, and transparent computational research in the life sciences</article-title>
          <source>Genome Biol.</source>
          <year>2010</year>
          <volume>11</volume>
          <fpage>R86</fpage>
        <pub-id pub-id-type="doi">10.1186/gb-2010-11-8-r86</pub-id><pub-id pub-id-type="pmid">20738864</pub-id></citation>
      </ref>
      <ref id="B65-genes-03-00545">
        <label>65.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Dinov</surname>
              <given-names>I.</given-names>
            </name>
            <name>
              <surname>Torri</surname>
              <given-names>F.</given-names>
            </name>
            <name>
              <surname>Macciardi</surname>
              <given-names>F.</given-names>
            </name>
            <name>
              <surname>Petrosyan</surname>
              <given-names>P.</given-names>
            </name>
            <name>
              <surname>Liu</surname>
              <given-names>Z.</given-names>
            </name>
            <name>
              <surname>Zamanyan</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Eggert</surname>
              <given-names>P.</given-names>
            </name>
            <name>
              <surname>Pierce</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Genco</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Knowles</surname>
              <given-names>J.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>Applications of the pipeline environment for visual informatics and genomics computations</article-title>
          <source>BMC Bioinformatics</source>
          <year>2011</year>
          <volume>12</volume>
          <fpage>304</fpage>
          <pub-id pub-id-type="doi">10.1186/1471-2105-12-304</pub-id>
        </citation>
      </ref>
      <ref id="B66-genes-03-00545">
        <label>66.</label>
        <citation citation-type="book">
          <person-group person-group-type="author">
            <name>
              <surname>Taylor</surname>
              <given-names>I.</given-names>
            </name>
            <name>
              <surname>Shields</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Wang</surname>
              <given-names>I.</given-names>
            </name>
            <name>
              <surname>Harrison</surname>
              <given-names>A.</given-names>
            </name>
          </person-group>
          <article-title>The Triana Workflow Environment: Architecture and Applications</article-title>
          <source>Workflows for e-Science</source>
          <person-group person-group-type="editor">
            <name>
              <surname>Taylor</surname>
              <given-names>I.</given-names>
            </name>
            <name>
              <surname>Deelman</surname>
              <given-names>E.</given-names>
            </name>
            <name>
              <surname>Gannon</surname>
              <given-names>D.</given-names>
            </name>
            <name>
              <surname>Shields</surname>
              <given-names>M.</given-names>
            </name>
          </person-group>
          <publisher-name>Springer</publisher-name>
          <publisher-loc>Secaucus, NJ, USA</publisher-loc>
          <year>2007</year>
          <fpage>320</fpage>
          <lpage>339</lpage>
        </citation>
      </ref>
      <ref id="B67-genes-03-00545">
        <label>67.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Kwon</surname>
              <given-names>Y.</given-names>
            </name>
            <name>
              <surname>Shigemoto</surname>
              <given-names>Y.</given-names>
            </name>
            <name>
              <surname>Kuwana</surname>
              <given-names>Y.</given-names>
            </name>
            <name>
              <surname>Sugawara</surname>
              <given-names>H.</given-names>
            </name>
          </person-group>
          <article-title>Web API for biology with a workflow navigation system</article-title>
          <source>Nucleic Acids Res.</source>
          <year>2009</year>
          <volume>37</volume>
          <fpage>W11</fpage>
          <lpage>W16</lpage>
        <pub-id pub-id-type="doi">10.1093/nar/gkp300</pub-id><pub-id pub-id-type="pmid">19417067</pub-id></citation>
      </ref>
      <ref id="B68-genes-03-00545">
        <label>68.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Oinn</surname>
              <given-names>T.</given-names>
            </name>
            <name>
              <surname>Addis</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Ferris</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Marvin</surname>
              <given-names>D.</given-names>
            </name>
            <name>
              <surname>Senger</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Greenwood</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Carver</surname>
              <given-names>T.</given-names>
            </name>
            <name>
              <surname>Glover</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Pocock</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Wipat</surname>
              <given-names>A.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>Taverna: A tool for the composition and enactment of bioinformatics workflows</article-title>
          <source>Bioinformatics</source>
          <year>2004</year>
          <volume>20</volume>
          <fpage>3045</fpage>
          <lpage>3054</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/bth361</pub-id>
        </citation>
      </ref>
      <ref id="B69-genes-03-00545">
        <label>69.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Schatz</surname>
              <given-names>M.</given-names>
            </name>
          </person-group>
          <article-title>The missing graphical user interface for genomics</article-title>
          <source>Genome Biol.</source>
          <year>2010</year>
          <volume>11</volume>
          <fpage>128</fpage>
          <pub-id pub-id-type="doi">10.1186/gb-2010-11-8-128</pub-id>
        </citation>
      </ref>
      <ref id="B70-genes-03-00545">
        <label>70.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Dinov</surname>
              <given-names>I.</given-names>
            </name>
            <name>
              <surname>Lozev</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Petrosyan</surname>
              <given-names>P.</given-names>
            </name>
            <name>
              <surname>Liu</surname>
              <given-names>Z.</given-names>
            </name>
            <name>
              <surname>Eggert</surname>
              <given-names>P.</given-names>
            </name>
            <name>
              <surname>Pierce</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Zamanyan</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Chakrapani</surname>
              <given-names>S.</given-names>
            </name>
            <name>
              <surname>van Horn</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Parker</surname>
              <given-names>D.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>Neuroimaging study designs, computational analyses and data provenance using the LONI Pipeline</article-title>
          <source>PLoS One</source>
          <year>2010</year>
          <volume>5</volume>
          <fpage>e13070</fpage>
        <pub-id pub-id-type="doi">10.1371/journal.pone.0013070</pub-id><pub-id pub-id-type="pmid">20927408</pub-id></citation>
      </ref>
      <ref id="B71-genes-03-00545">
        <label>71.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Rex</surname>
              <given-names>D.E.</given-names>
            </name>
            <name>
              <surname>Ma</surname>
              <given-names>J.Q.</given-names>
            </name>
            <name>
              <surname>Toga</surname>
              <given-names>A.W.</given-names>
            </name>
          </person-group>
          <article-title>The LONI Pipeline processing environment</article-title>
          <source>Neuroimage</source>
          <year>2003</year>
          <volume>19</volume>
          <fpage>1033</fpage>
          <lpage>1048</lpage>
        <pub-id pub-id-type="doi">10.1016/S1053-8119(03)00185-X</pub-id><pub-id pub-id-type="pmid">12880830</pub-id></citation>
      </ref>
      <ref id="B72-genes-03-00545">
        <label>72.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Service</surname>
              <given-names>R.F.</given-names>
            </name>
          </person-group>
          <article-title>Gene sequencing. The race for the $1000 genome</article-title>
          <source>Science</source>
          <year>2006</year>
          <volume>311</volume>
          <fpage>1544</fpage>
          <lpage>1546</lpage>
          <pub-id pub-id-type="doi">10.1126/science.311.5767.1544</pub-id>
        </citation>
      </ref>
      <ref id="B73-genes-03-00545">
        <label>73.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Mardis</surname>
              <given-names>E.R.</given-names>
            </name>
          </person-group>
          <article-title>Next-generation DNA sequencing methods</article-title>
          <source>Annu. Rev. Genomics Hum. Genet.</source>
          <year>2008</year>
          <volume>9</volume>
          <fpage>387</fpage>
          <lpage>402</lpage>
          <pub-id pub-id-type="doi">10.1146/annurev.genom.9.081307.164359</pub-id>
        </citation>
      </ref>
      <ref id="B74-genes-03-00545">
        <label>74.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Mardis</surname>
              <given-names>E.R.</given-names>
            </name>
          </person-group>
          <article-title>The impact of next-generation sequencing technology on genetics</article-title>
          <source>Trends Genet.</source>
          <year>2008</year>
          <volume>24</volume>
          <fpage>133</fpage>
          <lpage>141</lpage>
          <pub-id pub-id-type="doi">10.1016/j.tig.2007.12.007</pub-id>
        </citation>
      </ref>
      <ref id="B75-genes-03-00545">
        <label>75.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Li</surname>
              <given-names>H.</given-names>
            </name>
            <name>
              <surname>Handsaker</surname>
              <given-names>B.</given-names>
            </name>
            <name>
              <surname>Wysoker</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Fennell</surname>
              <given-names>T.</given-names>
            </name>
            <name>
              <surname>Ruan</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Homer</surname>
              <given-names>N.</given-names>
            </name>
            <name>
              <surname>Marth</surname>
              <given-names>G.</given-names>
            </name>
            <name>
              <surname>Abecasis</surname>
              <given-names>G.</given-names>
            </name>
            <name>
              <surname>Durbin</surname>
              <given-names>R.</given-names>
            </name>
            <collab>1000 Genome Project Data Processing Subgroup</collab>
          </person-group>
          <article-title>The sequence alignment/map format and SAMtools</article-title>
          <source>Bioinformatics</source>
          <year>2009</year>
          <volume>25</volume>
          <fpage>2078</fpage>
          <lpage>2079</lpage>
        <pub-id pub-id-type="doi">10.1093/bioinformatics/btp352</pub-id><pub-id pub-id-type="pmid">19505943</pub-id></citation>
      </ref>
      <ref id="B76-genes-03-00545">
        <label>76.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Leung</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Parker</surname>
              <given-names>D.S.</given-names>
            </name>
            <name>
              <surname>Cunha</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Dinov</surname>
              <given-names>I.D.</given-names>
            </name>
            <name>
              <surname>Toga</surname>
              <given-names>A.W.</given-names>
            </name>
          </person-group>
          <article-title>IRMA: An image registration meta-algorithm—Evaluating alternative algorithms with multiple metrics</article-title>
          <source>Lect. Notes Comput. Sci.</source>
          <year>2008</year>
          <volume>5069</volume>
          <fpage>612</fpage>
          <lpage>617</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-540-69497-7_46</pub-id>
        </citation>
      </ref>
      <ref id="B77-genes-03-00545">
        <label>77.</label>
        <citation citation-type="thesis">
          <person-group person-group-type="author">
            <name>
              <surname>Leung</surname>
              <given-names>K.T.K.</given-names>
            </name>
          </person-group>
          <article-title>Principal Ranking Meta-Algorithms</article-title>
          <comment>Ph.D. Thesis, University of California, Los Angeles, CA, USA, 2011.</comment>
        </citation>
      </ref>
      <ref id="B78-genes-03-00545">
        <label>78.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Rex</surname>
              <given-names>D.E.</given-names>
            </name>
            <name>
              <surname>Shattuck</surname>
              <given-names>D.W.</given-names>
            </name>
            <name>
              <surname>Woods</surname>
              <given-names>R.P.</given-names>
            </name>
            <name>
              <surname>Narr</surname>
              <given-names>K.L.</given-names>
            </name>
            <name>
              <surname>Luders</surname>
              <given-names>E.</given-names>
            </name>
            <name>
              <surname>Rehm</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Stolzner</surname>
              <given-names>S.E.</given-names>
            </name>
            <name>
              <surname>Rottenberg</surname>
              <given-names>D.E.</given-names>
            </name>
            <name>
              <surname>Toga</surname>
              <given-names>A.W.</given-names>
            </name>
          </person-group>
          <article-title>A meta-algorithm for brain extraction in MRI</article-title>
          <source>NeuroImage</source>
          <year>2004</year>
          <volume>23</volume>
          <fpage>625</fpage>
          <lpage>637</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neuroimage.2004.06.019</pub-id>
        </citation>
      </ref>
      <ref id="B79-genes-03-00545">
        <label>79.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Ruffalo</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>LaFramboise</surname>
              <given-names>T.</given-names>
            </name>
            <name>
              <surname>Koyutürk</surname>
              <given-names>M.</given-names>
            </name>
          </person-group>
          <article-title>Comparative analysis of algorithms for next-generation sequencing read alignment</article-title>
          <source>Bioinformatics</source>
          <year>2011</year>
          <volume>27</volume>
          <fpage>2790</fpage>
          <lpage>2796</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btr477</pub-id>
        </citation>
      </ref>
      <ref id="B80-genes-03-00545">
        <label>80.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>McKenna</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Hanna</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Banks</surname>
              <given-names>E.</given-names>
            </name>
            <name>
              <surname>Sivachenko</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Cibulskis</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Kernytsky</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Garimella</surname>
              <given-names>K.</given-names>
            </name>
            <name>
              <surname>Altshuler</surname>
              <given-names>D.</given-names>
            </name>
            <name>
              <surname>Gabriel</surname>
              <given-names>S.</given-names>
            </name>
            <name>
              <surname>Daly</surname>
              <given-names>M.</given-names>
            </name>
            <etal/>
          </person-group>
          <article-title>The Genome Analysis Toolkit: A MapReduce framework for analyzing next-generation DNA sequencing data</article-title>
          <source>Genome Res.</source>
          <year>2010</year>
          <volume>20</volume>
          <fpage>1297</fpage>
          <lpage>1303</lpage>
          <pub-id pub-id-type="doi">10.1101/gr.107524.110</pub-id>
        </citation>
      </ref>
      <ref id="B81-genes-03-00545">
        <label>81.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Gunter</surname>
              <given-names>C.</given-names>
            </name>
          </person-group>
          <article-title>Genomics: A picture worth 1000 genomes</article-title>
          <source>Nat. Rev. Genet.</source>
          <year>2010</year>
          <volume>11</volume>
          <fpage>814</fpage>
          <pub-id pub-id-type="doi">10.1038/nrg2906</pub-id>
        </citation>
      </ref>
      <ref id="B82-genes-03-00545">
        <label>82.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Fiume</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Williams</surname>
              <given-names>V.</given-names>
            </name>
            <name>
              <surname>Brook</surname>
              <given-names>A.</given-names>
            </name>
            <name>
              <surname>Brudno</surname>
              <given-names>M.</given-names>
            </name>
          </person-group>
          <article-title>Savant: Genome browser for high-throughput sequencing data</article-title>
          <source>Bioinformatics</source>
          <year>2010</year>
          <volume>26</volume>
          <fpage>1938</fpage>
          <lpage>1944</lpage>
        <pub-id pub-id-type="doi">10.1093/bioinformatics/btq332</pub-id><pub-id pub-id-type="pmid">20562449</pub-id></citation>
      </ref>
      <ref id="B83-genes-03-00545">
        <label>83.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Hamada</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Wijaya</surname>
              <given-names>E.</given-names>
            </name>
            <name>
              <surname>Frith</surname>
              <given-names>M.C.</given-names>
            </name>
            <name>
              <surname>Asai</surname>
              <given-names>K.</given-names>
            </name>
          </person-group>
          <article-title>Probabilistic alignments with quality scores: An application to short-read mapping toward accurate SNP/indel detection</article-title>
          <source>Bioinformatics</source>
          <year>2011</year>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btr537</pub-id>
        </citation>
      </ref>
      <ref id="B84-genes-03-00545">
        <label>84.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Hamada</surname>
              <given-names>M.</given-names>
            </name>
            <name>
              <surname>Wijaya</surname>
              <given-names>E.</given-names>
            </name>
            <name>
              <surname>Frith</surname>
              <given-names>M.C.</given-names>
            </name>
            <name>
              <surname>Asai</surname>
              <given-names>K.</given-names>
            </name>
          </person-group>
          <article-title>Probabilistic alignments with quality scores: An application to short-read mapping toward accurate SNP/indel detection</article-title>
          <source>Bioinformatics</source>
          <year>2011</year>
          <volume>27</volume>
          <fpage>3085</fpage>
          <lpage>3092</lpage>
        <pub-id pub-id-type="doi">10.1093/bioinformatics/btr537</pub-id><pub-id pub-id-type="pmid">21976422</pub-id></citation>
      </ref>
      <ref id="B85-genes-03-00545">
        <label>85.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Raffan</surname>
              <given-names>E.</given-names>
            </name>
            <name>
              <surname>Semple</surname>
              <given-names>R.K.</given-names>
            </name>
          </person-group>
          <article-title>Next generation sequencing—Implications for clinical practice</article-title>
          <source>Br. Med. Bull.</source>
          <year>2011</year>
          <volume>99</volume>
          <fpage>53</fpage>
          <lpage>71</lpage>
          <pub-id pub-id-type="doi">10.1093/bmb/ldr029</pub-id>
        </citation>
      </ref>
      <ref id="B86-genes-03-00545">
        <label>86.</label>
        <citation citation-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Haas</surname>
              <given-names>J.</given-names>
            </name>
            <name>
              <surname>Katus</surname>
              <given-names>H.A.</given-names>
            </name>
            <name>
              <surname>Meder</surname>
              <given-names>B.</given-names>
            </name>
          </person-group>
          <article-title>Next-generation sequencing entering the clinical arena</article-title>
          <source>Mol. Cell. Probes</source>
          <year>2011</year>
          <volume>25</volume>
          <fpage>206</fpage>
          <lpage>211</lpage>
        <pub-id pub-id-type="doi">10.1016/j.mcp.2011.08.005</pub-id><pub-id pub-id-type="pmid">21914469</pub-id></citation>
      </ref>
    </ref-list>
    <app-group>
      <app>
        <title>Supplementary Files</title>
        <supplementary-material xmlns:xlink="http://www.w3.org/1999/xlink" id="genes-03-00545-s001" xlink:href="genes-03-00545-s001.zip">
          <label>Supplementary File 1</label>
          <caption>
            <p>ZIP-Document (ZIP, 7447 KB)</p>
          </caption>
        </supplementary-material>
      </app>
    </app-group>
  </back>
  </article>
