-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathreport.tex
412 lines (331 loc) · 44.2 KB
/
report.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
\documentclass[journal,transmag]{IEEEtran}
% ..........................................................................
% Packages, configuration settings, and macro definitions.
\usepackage[pdftex]{graphicx}
\graphicspath{{figures/}}
\DeclareGraphicsExtensions{.pdf,.jpeg,.png}
\usepackage[pdftex,rgb,dvipsnames,svgnames,hyperref,table]{xcolor}
\usepackage[pdftex,breaklinks=true,colorlinks=true,
bookmarks=false,pdfhighlight=/O,
urlcolor=DarkBlue,citecolor=DarkRed,linkcolor=DarkBlue]{hyperref}
\usepackage[cmex10]{amsmath}
\interdisplaylinepenalty=2500
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{multicol}
\usepackage{multirow}
\usepackage{enumitem}
\usepackage{accsupp}
\usepackage{array}
\usepackage[caption=false,font=footnotesize]{subfig}
\usepackage{booktabs}
\usepackage{xspace}
\usepackage{soul}
\usepackage{url}
\usepackage{hyphenat}
\usepackage[english]{babel}
\usepackage[square,numbers,sort&compress]{natbib}
\usepackage{booktabs}
% Correct bad hyphenation here
\hyphenation{}
\newcommand{\email}[1]{\href{mailto:#1}{#1}}
\usepackage{pdfcomment}
\newcommand{\comment}[3]{\pdfmarkupcomment[markup=Highlight,color=yellow,author={#2}]{#1}{#3}}
\usepackage{tabularx}
% ..........................................................................
% Body.
\begin{document}
\nocite{IEEEBSTcontrol}
\markboth{IEEE Transactions on Biomedical Engineering}{Standards for whole-cell modeling}
\title{Toward standards for tomorrow's whole-cell models}
\author{
Dagmar Waltemath$^{*,+}$,
Jonathan R. Karr$^+$,
Frank T. Bergmann,
Vijayalakshmi Chelliah,
Michael Hucka,\\
Marcus Krantz,
Wolfram Liebermeister,
Pedro Mendes,
Chris J. Myers,~\IEEEmembership{Senior~Member,~IEEE,}
Pinar Pir,\\
%%%%%%%%%%%%%%%%%%%
Begum Alaybeyoglu,
Naveen K Aranganathan,
Kambiz Baghalian,
Arne T. Bittig,
Paulo E. Pinto Burke, \\
Matteo Cantarelli,
Yin Hoon Chew,
Rafael S. Costa,
Joseph Cursons,
Tobias Czauderna,
% Muhammad Haseeb,
Harold F. G{\'o}mez,\\
Jens Hahn,
Tuure Hameri,
Denis Kazakiewicz,
Ilya Kiselev,
Vincent Knight-Schrijver,
Christian Kn\"{u}pfer, \\
Matthias K\"{o}nig,
Daewon Lee,
Audald Lloret-Villas,
Nikita Mandrik,
J. Kyle Medley,
Bertrand Moreau, \\
Hojjat Naderi-Meshkin,
Sucheendra K. Palaniappan,
Daniel Priego-Espinosa,
Martin Scharm, \\
Mahesh Sharma,
Kieran Smallbone,
Natalie Stanford,
Je-Hoon Song,
Tom Theile,
Milenko Tokic, \\
Namrata Tomar,
Jannis Uhlendorf,
Thawfeek M Varusai,
Florian Wendland,
Markus Wolfien, \\
James T. Yurkovich,
Yan Zhu,
Argyris Zardilis,
Anna Zhukova, and
Falk Schreiber$^+$\\
\thanks{
Manuscript received XXX XX, 2015; revised XXX XX, 201X; accepted XXX XX, 201X. Date of publication XXX XX, 201X; date of current version XXX XX, 201X.
This work was supported in part by the Volkswagen Foundation (Grant to D. W. and F. S.) and the James S. McDonnell Foundation (Postdoctoral Fellowship Award in Studying Complex Systems to J. R. K.).
\textit{Asterisk indicates corresponding author: Dagmar Waltemath \email{[email protected]}. Plus indicates these authors contributed equally to this work.}
}
\thanks{$^*$D. Waltemath, A. T. Bittig, M. Scharm, T. Theile, F. Wendland and M. Wolfien are with the Institute of Computer Science, University of Rostock, 18051 Rostock, Germany%(e-mail: \email{[email protected]}; \email{[email protected]}; \email{[email protected]}; \email{[email protected]}; \email{[email protected]}; \email{[email protected]})
.}
\thanks{J.~R. Karr is with the Department of Genetics \& Genomic Sciences, Icahn School of Medicine at Mount Sinai, New York, NY 10029, USA%(e-mail: \email{[email protected]})
.}
\thanks{C.~J. Myers is with the Department of Electrical and Computer Engineering, University of Utah, Salt Lake City, Utah 84112, USA%(e-mail: \email{[email protected]})
.}
\thanks{M. Hucka is with the Department of Computing and Mathematical Sciences, California Institute of Technology, Pasadena, CA 91125, USA%(e-mail: \email{[email protected]})
.}
\thanks{F.~T. Bergmann is with BioQuant, University of Heidelberg, 69120 Heidelberg, Germany% (e-mail: \email{[email protected]})
.}
\thanks{N.~K. Aranganathan, A. Lloret-Villas and V. Chelliah are with the European Bioinformatics Institute (EMBL-EBI), European Molecular Biology Laboratory, Cambridge CB10 1SD, UK% (e-mail: \email{[email protected], [email protected], [email protected]})
.}
\thanks{J. Hahn, M. Krantz and J. Uhlendorf are with the Department of Biology, Humboldt University of Berlin, 10115 Berlin, Germany% (e-mail: \email{[email protected]}; \email{[email protected]})
.}
\thanks{W. Liebermeister and M. K\"{o}nig are with the Institute of Biochemistry, University Medicine Charit\'{e} Berlin, 10117 Berlin, Germany% (e-mail: \email{[email protected]}; \email{[email protected]})
.}
\thanks{P. Mendes is with the Manchester Institute of Biotechnology and the School of Computer Science, University of Manchester, Manchester M1 7DN, UK and also with the Center for Quantitative Medicine and the Department of Cell Biology, University of Connecticut Health Center, Farmington, CT 06030, USA% (e-mail: \email{[email protected]})
.}
\thanks{P. Pir and V. Knight-Schrijver are with the Babraham Institute, Cambridge CB22 3AT, UK% (e-mail: \email{[email protected]})
.}
\thanks{B. Alaybeyoglu is with the Department of Chemical Engineering, Bo\v{g}azi\c{c}i University, Bebek 34342, Turkey% (e-mail: \email{[email protected]})
.}
\thanks{K. Baghalian is with the Department of Plant Sciences, University of Oxford, South Parks Road, Oxford, UK% (e-mail: \email{[email protected]}
.}
\thanks{P.~E. Pinto Burke is with the Institute of Science and Technology, Federal University of S{\~a}o Paulo, Brazil.}
\thanks{Y.~H. Chew is with the Centre for Synthetic and Systems Biology, University of Edinburgh, Edinburgh EH9 3BF, UK% (e-mail: \email{[email protected]})
.}
\thanks{M. Cantarelli is with OpenWorm% (e-mail: \email{[email protected]})
.}
\thanks{R.~S. Costa is with the Centre of Intelligent Systems-IDMEC, Instituto Superior T{\'e}cnico, University of Lisbon, 1049-001 Lisboa, Portugal% (e-mail: \email{[email protected]})
.}
\thanks{J. Cursons is with the Department of Biomedical Engineering, School of Engineering, University of Melbourne, Parkville, VIC 3010, Australia% (e-mail: \email{[email protected]})
.}
\thanks{T. Czauderna is with the Faculty of Information Technology, Monash University, Clayton, VIC 3800, Australia% (e-mail: \email{[email protected]})
.}
\thanks{H.~F. G{\'o}mez is with the Department of Biosystems Science and Engineering, ETH Z{\"u}rich, Basel, Switzerland% (e-mail: \email{[email protected]}
.}
\thanks{T. Hameri and M. Tokic are with the Laboratory of Computational Systems Biotechnology (LCSB), Swiss Federal Institute of Technology (EPFL), CH-1015 Lausanne, Switzerland. M. Tokic is also with the Swiss Institute of Bioinformatics (SIB), CH-1015 Lausanne, Switzerland% (e-mail: \email{[email protected]}; \email{[email protected]})
.}
% \thanks{M. Haseeb is with the Department of Bioinformatics, Mohammad Ali Jinnah University, Islamabad, Pakistan.}
\thanks{D. Kazakiewicz is with the Center for Statistics, Universiteit Hasselt, Hasselt BE3500, Belgium, and also with the Center for Innovative Research, Medical University of Bia\l{}ystok, Bia\l{}ystok 15-089, Poland% (e-mail: \email{[email protected]})
.}
\thanks{I. Kiselev is with the Design Technological Institute of Digital Techniques, Siberian Branch of the Russian Academy of Sciences, Novosibirsk 630090, Russia.}
\thanks{C. Kn\"{u}pfer is with the Institut f\"ur Informatik, University of Jena, 07743 Jena, Germany% (e-mail: \email{[email protected]})
.}
\thanks{D. Lee and J.-H. Song are with the Department of Bio and Brain Engineering, Korea Advanced Institute of Science and Technology, Daejeon 305-701, Republic of Korea% (e-mail: \email{[email protected]})
.}
\thanks{N. Mandrik is with the Sobolev Institute of Mathematics, Siberian Branch of the Russian Academy of Sciences, Novosibirsk 630090, Russia% (e-mail: \email{[email protected]})
.}
\thanks{J.~K. Medley is with the Department of Bioengineering, University of Washington, Seattle, WA 98195, USA% (e-mail: \email{[email protected]})
.}
\thanks{H. Naderi-Meshkin is with the Stem Cell and Regenerative Medicine Research Department, Iranian Academic Center for Education, Culture and Research (ACECR), Khorasan Razavi Branch, Mashhad, Iran% (e-mail: \email{[email protected]})
.}
\thanks{B. Moreau is with the CoSMo Company, Lyon, France% (e-mail: \email{[email protected]})
.}
\thanks{S.~K. Palaniappan is with the Rennes - Bretagne Atlantique Research Centre, Institute for Research in Computer Science and Automation, 35042 Rennes Cedex, France.}
\thanks{D.~Priego-Espinosa is with the Instituto de Ciencias F{\'i}sicas, Universidad Nacional Aut{\'o}noma de M{\'e}xico, M{\'e}xico.}
\thanks{M. Sharma is with the Department of Pharmacoinformatics, National Institute of Pharmaceutical Education and Research, Punjab 160062, India.}
\thanks{K. Smallbone and N. Stanford are with the Manchester Centre for Integrative Systems Biology, University of Manchester, Manchester M1 7DN, UK% (e-mail: \email{[email protected]}; \email{[email protected]})
.}
\thanks{N. Tomar is with the Department of Dermatology, University Medicine, Friedrich-Alexander University of Erlangen-N\"urnberg, Erlangen, Germany.}
\thanks{T.~M. Varusai is with the Department of Systems Biology Ireland, University College Dublin, Belfield, Dublin 4, Ireland% (e-mail: \email{[email protected]})
.}
\thanks{J.~T. Yurkovich is with the Department of Bioengineering, University of California, San Diego, La Jolla, CA 92093, USA% (e-mail: \email{[email protected]})
.}
\thanks{Y. Zhu is with Monash Institute of Pharmaceutical Sciences, Monash University, Parkville, VIC 3052, Australia.}
\thanks{A. Zardilis is with the Centre for Synthetic and Systems Biology, University of Edinburgh, UK.}
\thanks{A. Zhukova is with the Institut de Biochimie et G\'en\'etique Cellulaires, National Center for Scientific Research, and also with the University of Bordeaux, 33077 Bordeaux Cedex, France% (e-mail: \email{[email protected]})
.}
\thanks{F. Schreiber is with the Faculty of Information Technology, Monash University, Clayton, VIC 3800, Australia and also with the Institute of Computer Science, Martin Luther University Halle-Wittenberg, 06108 Halle, Germany% (e-mail: \email{[email protected]})
.}
\thanks{Digital Object Identifier 10.1109/TBME.XXXX.XXXXXXX}
}
\maketitle
\begin{abstract}
Whole-cell modeling is a promising tool for biological research, bioengineering, and medicine.
However, substantial work remains to create complete, accurate, and reproducible models.
Among the advances needed are a strong theoretical understanding of multi-algorithm modeling, standardized modeling languages, and an efficient general-purpose simulator.
We organized the 2015 Whole-Cell Modeling Summer School to teach whole-cell modeling, as well as to evaluate the need for new modeling standards and tools by encoding a recently published whole-cell model in SBML.
We describe several standard extensions, software tools, and databases that are needed to facilitate reproducible whole-cell modeling, including a graphical model editor, a multi-algorithm simulator, and several SBGN extensions.
Together these new standard extensions and software tools could accelerate whole-cell modeling.
\end{abstract}
\begin{IEEEkeywords}
Whole-cell modeling, Systems biology, Computational biology, Simulation, Standards, Education
\end{IEEEkeywords}
\IEEEpeerreviewmaketitle
\section{Introduction}
\IEEEPARstart{O}{ver} the past twenty years, computational modeling has become an essential and powerful tool for biological research, bioengineering, and medicine to analyze high-throughput molecular measurements and understand the molecular details of complex biological systems. Computational modeling has been used to identify new metabolic genes~\cite{Reed2006}, to engineer metabolic pathways in bacteria~\cite{Lee2012}, and to identify potential new antimicrobial drug targets~\cite{Lee2009}.
Computational models also have the potential to enable bioengineers to design new microorganisms for industrial applications such as chemical synthesis, biofuel production, and waste decontamination, as well as to enable clinicians to tailor therapy to individual patients. Realizing this potential requires more comprehensive and accurate computational models that are capable of predicting cellular behavior from genotype. Realizing this potential also requires improved simulation tools, as well as standardized tools for storing and exchanging models, simulation experiments, and visualizations~\cite{Macklin2014,Karr2015,Karr2015b,hucka2015promoting,Klipp07,path2models2013}.
Recently, researchers at Stanford University developed the first whole-cell model of the gram-positive bacterium \textit{Mycoplasma genitalium}~\cite{Karr2012}. The model represents the life cycle of a single Mycoplasma cell including the copy number dynamics of each metabolite, RNA, and protein species and accounts for every known gene function. The model is composed of 28 submodels, each of which is implemented using different mathematical formalisms including \emph{ordinary differential equations} (ODEs), \emph{flux balance analysis} (FBA), and \emph{Boolean rules} (BRs), and trained using different experimental data.
The model is implemented in MATLAB, is available open-source under the MIT license~\cite{wholeCell}, and is extensively documented. This has enabled other researchers to use the model to conduct \textit{in silico} experiments~\cite{Sanghvi2013, Purcell2013, Kazakiewicz2015}.
Despite extensive documentation, this whole-cell model software is difficult to use. The software is large, complex, and time-consuming to learn. The software also cannot easily be reused to simulate other models because many of the model details are intertwined with the software. In addition, many academic researchers and many software developers cannot use the model software because MATLAB is proprietary and expensive. Compared to other languages with larger development communities such as Python, MATLAB also has fewer packages and development tools. In particular, MATLAB has limited support for object-oriented programming. Furthermore, the model software is not optimally efficient because MATLAB is just-in-time compiled and dynamically typed.
Covert and colleagues developed the WholeCellKB~\cite{Karr2013}, WholeCellSimDB~\cite{Karr2014}, and WholeCellViz~\cite{Lee2013} software tools to provide user-friendly interfaces to their modeling software. However, significant domain expertise is still required to use their modeling software, particularly to construct new models. Alternative approaches are needed to enable more researchers to develop and simulate their own whole-cell models. Software-independent standards and other open-source software tools have the potential to enable researchers to develop such models more quickly, to explore them more deeply, and to evaluate them more rigorously. Furthermore, standards would make whole-cell models more reusable and comparable, as well as more searchable and retrievable through model repositories such as BioModels~\cite{juty2015biomodels,chelliah2015biomodels}.
Several software-agnostic systems biology standards have already been developed by the \emph{COmputational Modeling in BIology NEtwork} (COMBINE)~\cite{le2011meeting}, including the \emph{Systems Biology Markup Language} (SBML)~\cite{hucka2003}, the \emph{Cell Markup Language} (CellML)~\cite{hedley_2001b}, the \emph{Simulation Experiment Description Markup Language} (SED-ML)~\cite{sedml2011}, and the \emph{Systems Biology Graphical Notation} (SBGN)~\cite{LeNovereHMMSS09} (see Table~\ref{tab:standards}). SBML and CellML are languages for representing mathematical models. CellML focuses on modularly encoding mathematical models, while SBML focuses on describing biological processes. Both support several modeling formalisms including ODEs and FBA. SED-ML is a language for describing computational experiments, including the simulation algorithm and parameter values. SED-ML enables scientists to reproduce simulations. SBGN is a visual notation for describing biological processes. To date, none of these open standards have been used with models as complex as the \textit{M. genitalium} model.
\begin{table*}[tb!]
\caption{\label{tab:standards} Standards and infrastructure used during the summer school.}
\begin{tabularx}{\textwidth}{llX}\toprule
\textbf{Acronym} & \textbf{Name} & \textbf{Description}\\\midrule
%CellML & Cell Markup Language & Standard for describing dynamical models in terms of mathematical relationships\\
COMBINE & COmputational Modeling in BIology NEtwork & Community dedicated to developing standards and associated software tools\\
\midrule
SBGN & Systems Biology Graphical Notation & Standard for describing biochemical pathway diagrams \\
SBML Level 3 Core & Systems Biology Markup Language & Standard for describing dynamical models in terms of biochemical processes \\
SBML Comp & SBML Package: Hierarchical Model Composition & Definition of how a model is composed from other models \cite{smith2013sbml}\\
SBML FBC & SBML Package: Flux Balance Constraints & Definition of constraint based models \cite{olivier2013sbml}\\
SBML Multi & SBML Package: Multistate and Multicomponent Species & Representation of entity pools with multiple states and composed of multiple components \cite{waltemath2014meeting}\\
SBML Arrays & SBML Package: Arrays & Support for expressing arrays of things \cite{waltemath2014meeting} \\
SED-ML & Simulation Experiment Description Markup Language & Standard for describing computational experiments\\
\bottomrule
\end{tabularx}
\end{table*}
We organized the 2015 Whole-Cell Modeling Summer School to train students in whole-cell modeling, as well as to evaluate the need for new standards and tools for whole-cell modeling. The majority of the school focused on encoding the \textit{M. genitalium} model using SBML, visualizing it with SBGN, and simulating it with SED-ML. This was both a learning exercise and a tool for evaluating the capabilities of standard representations for encoding whole-cell models. The ultimate goal of the school was to encode an open-source whole-cell model in SBML, visualize the model using SBGN, simulate the model using open-source software, and use the model to conduct \textit{in silico} experiments encoded in SED-ML. We chose to focus the course on SBML because there was insufficient time to evaluate both SBML and CellML.
Here, we describe the summer school, outline our progress encoding the \textit{M. genitalium} model using standards, and describe several standard extensions and software tools that we believe are needed to support efficient whole-cell modeling.
\section{The 2015 Whole-Cell Modeling Summer School}
We organized the summer school to teach students how to build and encode models using COMBINE standards by encoding the \textit{M. genitalium} model using software-independent, standard representation formats.
\subsection{Organization}
The Whole-Cell Modeling Summer School was held March 9--13, 2015, at the University of Rostock, Germany. The school was organized by Dagmar Waltemath and Falk Schreiber and supported by the Volkswagen Foundation. The school included 43 students, nine instructors, and two organizers.
The school began with two introductory lectures on modeling and modeling standards. Jonathan Karr from the Icahn School of Medicine at Mount Sinai, USA, presented an overview of whole-cell and multi-algorithm modeling. Michael Hucka from the California Institute of Technology, USA, presented an overview of the SBML, SED-ML, and SBGN standards; open-source software tools that support these standards; and the COMBINE initiative. We also organized three discussions on model composition, particle-based state representation, and stochastic modeling.
The majority of the school was devoted to hands-on learning about whole-cell modeling and the COMBINE standards by encoding the \textit{M. genitalium} model in SBML, creating visualizations using SBGN, and defining simulations using SED-ML. The 28 submodels were divided among nine groups of four to five students and one instructor.
Each day concluded with brief progress reports from each group to exchange ideas and facilitate discussion.
We also organized a poster session and several social activities to encourage students to network.
\subsection{Educational outcome}
Most students reported gaining knowledge of whole-cell modeling, increased appreciation for reproducibility, and increased understanding of the SBML, SED-ML, and SBGN standards. Many students also reported learning about open-source software tools relevant to their own research.
In addition, many of the students reported that the school expanded their network. The school introduced several students to other workshops and job opportunities.
\subsection{Lessons learned for organizing research-based schools}
We learned several valuable lessons about how to best organize a research-based school. First, we learned that research-based schools should have clear background knowledge expectations and learning objectives and have well-planned learning exercises. This helps students make informed decisions about whether to participate in the school, know how to prepare for the school, and learn efficiently.
Second, we found that students greatly enjoy learning through open research problems rather than through prescribed training exercises. This challenges students and engages them in research. This also helps students build practical skills that complement their undergraduate training.
Third, we found that open-ended project-based schools require a high teacher-to-student ratio, a flexible schedule, and multidisciplinary project teams. A high teacher-to-student ratio allows students to get feedback and iterate through potential solutions quickly. A flexible schedule enables impromptu lectures and discussions. Multidisciplinary teams enable students to work through difficult problems by drawing on perspectives from multiple fields.
\section{Toward an SBML-encoded whole-cell model}
The second goal of the school was to encode the \textit{M. genitalium} whole-cell model in SBML. To achieve this goal, most of the course was devoted to active learning sessions in which students were challenged to encode submodels of the \textit{M. genitalium} model in SBML, integrate submodels into a single model, and simulate models using SED-ML. During these sessions, the students and instructors were divided into nine groups. Eight of the groups were tasked with encoding one or more submodels. The ninth group was tasked with developing a standards-compliant scheme to integrate the submodels into a single model. In addition, three instructors helped all of the groups annotate and visualize their submodels and one instructor helped all of the groups encode their submodels in SBML. Table~SI in the supplemental material lists the nine groups and all of the students and instructors.
\subsection{Submodel encoding}
The eight submodel encoding groups pursued various strategies to encode submodels in SBML. Several of the groups encoded submodels by first reading the submodel documentation, then drawing pathway diagrams using software tools such as CellDesigner~\cite{funahashi2008celldesigner} and VANTED~\cite{Rohn2012}, and finally writing scripts to generate SBML models from their diagrams using libSBML~\cite{bornstein_2008}. Other groups used modeling software tools such as Antimony~\cite{Smith2009}, BioUML~\cite{Kolpakov2006}, COBRApy~\cite{ebrahim2013cobrapy}, COPASI~\cite{Mendes2009}, iBioSim~\cite{Madsen2012}, and libRoadRunner~\cite{Somogyi17062015} to encode submodels based on their documentation. A few of the groups encoded submodels by converting the MATLAB code to SBML. These groups then generated SBGN diagrams from their SBML to better understand their submodels.
The groups encountered several challenges to encoding the submodels in SBML. First, understanding the submodels was time-consuming because the documentation only summarizes the submodels, the connection between the submodels and the associated pathway/genome database is unclear, and many of the submodel details are implemented directly in the MATLAB code. Fortunately, one of the principal authors of the \textit{M. genitalium} model participated in the school.
A second challenge to encoding the submodels was to encode serially-executed MATLAB submodels in SBML, because SBML does not explicitly represent sequential operations. Most of the groups decided to tackle this problem by formalizing MATLAB submodels as discrete stochastic models and simulating them using the Gillespie stochastic simulation algorithm~\cite{gillespie1977}. One drawback of this approach is that it required assigning kinetics that are not present in the original MATLAB submodels due to insufficient kinetic data.
A third challenge to encoding the submodels was to encode the randomized algorithms used by the MATLAB submodels in SBML. For example, the MATLAB translation submodel includes a randomized algorithm that assigns amino acids to individual polypeptides. This algorithm is not equivalent to the Gillespie algorithm and cannot easily be encoded in SBML because plain SBML does not support random number generation. Most of the groups also solved this problem by formalizing submodels as stochastic models.
To encode many of the submodels in SBML, the groups also had to either enumerate the particle-based state representation used by the MATLAB submodels, or approximate the MATLAB submodels. The translation group chose to approximate their submodel by eliminating the internal dynamics of the polymerization of each polypeptide. Consequently, their submodel does not track the progress of individual ribosomes or account for base-specific translation rates. The replication, replication initiation, transcription, and transcriptional regulation groups chose to enumerate the chromosome representation used by the MATLAB model by creating Boolean indicator variables to represent the existence and protein-binding status of each base. This enumerated representation requires millions of variables. Consequently, the corresponding SBML files are computationally expensive to parse and simulate. Enumerating the rules that govern the joint values of the enumerated variables, such as the rules that represent the steric effects of DNA-bound proteins by preventing proteins from binding neighboring bases, is also impractical. Furthermore, editing this enumerated representation is difficult because thousands of variables and rules must be edited rather than a small number of variable and rule patterns.
The lack of SBML simulator support for arrays was another challenge to encoding submodels in SBML. All of the groups overcame the lack of array support by enumerating individual array elements and all matrix algebra computations. This created verbose SBML files that are difficult to interpret, maintain, and edit. Enumerating the matrix algebra computations also increases the computational cost of simulation.
Combined, we found it difficult to encode most of the MATLAB submodels in SBML. As discussed below, future progress in whole-cell modeling would be facilitated by expanded support for the SBML Hierarchical Model Composition, Multistate and Multicomponent Species~\cite{SBMLMulti} and Arrays~\cite{SBMLArrays} packages.
\begin{table*}[tb!]
\caption{\label{tab:new-standards} New and expanded tools needed to facilitate whole-cell modeling.}
\begin{tabularx}{\textwidth}{lX}\toprule
\textbf{Type} & \textbf{Description}\\\midrule
Database & Expanded molecular biological databases such as ChEBI \\
Software & Efficient, parallelized multi-algorithm simulator which supports the SBML Hierarchical Model Composition package \\
Software & Hybrid population/particle simulator which supports the SBML Multistate and Multicomponent Species package \\
Software & Graphical model editor which transparently builds models from pathway/genome databases\\
Standard and software & SBGN standard and tool support for hybrid maps containing Process Description, Entity Relationship, and Activity Flow nodes \\
\bottomrule
\end{tabularx}
\end{table*}
\subsection{Model integration}
The integration group was responsible for assembling the submodels into a single model. This included devising a scheme for representing global state variables, defining the interfaces exposed by the submodels to the global state variables, and developing a method to manage concurrent writing of shared state variables by multiple submodels. The integration group defined the global state variables as the union of all state variables shared by at least two submodels rather than by explicitly defining a set of global state variables as done by the original MATLAB-based simulation. The advantages of this approach are that submodel developers are not also required to develop global state variables and that it minimizes the number of global state variables. The disadvantages of this approach are that the total set of variables is less transparent and that it requires users to learn all of the submodels and their naming conventions to analyze model simulations.
The integration group standardized the submodel interfaces by defining a variable naming convention. This convention makes it clear how multiple local submodel variables map onto the same global variable. The integration group used the same variable names as those used by the MATLAB implementation. Matrix and particle-based variables were enumerated by creating multiple variables differentiated by suffixes.
The primary challenge faced by the integration group was managing concurrent editing of state variables by multiple submodels. The group explored several potential solutions. First, they explored sequentially simulating the submodels and updating the global state variables. This avoids needing to merge variable changes. However, under this approach, submodels are simulated with different variable values within each time step. Consequently, simulation predictions are sensitive to the submodel execution order.
The integration group also explored several more complex solutions that would enable submodels to be simulated with the same variable values within each time step. These strategies included reducing the submodel integration time step so that submodels do not request conflicting variable changes; dividing each of the shared state variables into multiple, independent sub-variables for each submodel, simulating the submodels, and merging the sub-variables to update global values; and using semaphores to manage concurrent variable editing whereby at each time step submodels request sets of atomic state changes and a controller decides which change sets are accepted. Each of these strategies has advantages and disadvantages. The first strategy is simple to understand and implement, but is computationally expensive. The second strategy is simple to implement and computationally efficient for independent variables, but is complex for coupled variables such as those that represent the chromosome protein occupancy. The third strategy is complex, but is more general than the second strategy and more computationally efficient than the first. Ultimately, we concluded that a combination of these strategies will be needed to efficiently manage concurrent writing of different types of shared variables.
The integration group implemented their strategies using SBML and SED-ML. In particular, the group wrote Python scripts to create SBML and SED-ML files automatically from the variable usage information provided by the other teams. The first script created template SBML models for each process to ensure consistent interfaces. The second script created a SED-ML file to implement the coordinated execution of all the current processes and share the variables using the schemes described above. The third script created a hierarchical SBML model that connected the processes in one global top-level file, and it also included separate models to implement variable sharing between the processes. The SED-ML and hierarchical SBML approaches are two alternatives that we decided to explore in parallel. Which one is ultimately used depends on whether one feels the description of the execution of the parallel processes is part of the model or the simulation strategy.
The integration group tested their strategies using iBioSim because iBioSim is one of the few simulators that supports model composition. Another challenge to integrating the submodels was the lack of a multi-algorithm simulator. The integration group plans to overcome this limitation by adding support for multi-algorithm simulation to iBioSim.
\subsection{Annotation, documentation, and visualization}
The annotation, documentation, and visualization group was responsible for annotating the model.
The goal was to annotate every model entity with a cross reference to an external database such as ChEBI~\cite{Hastings2013}, as in the case of small molecules, and/or in terms of other model entities, as in the case of chemical reactions.
The group wrote scripts to search molecular biology databases for every entity contained in the \textit{M. genitalium} model.
The main problem faced by the documentation group was that many model entities are not currently represented by any molecular biology database.
This shortcoming can easily be overcome by proposing new ontology terms to the databases.
This problem highlights the need to expand the molecular biology databases to aggregate data on more biological molecules and interactions.
The documentation group also helped the other groups visualize submodels by providing advice on SBGN and diagramming tools such as VANTED~\cite{Rohn2012}.
The main visualization problem encountered by the group was that whole-cell models require large diagrams that must be manually arranged to produce intuitive visualizations.
\subsection{Progress}
We produced preliminary SBML and SBGN versions of most of the \textit{M. genitalium} submodels and finished SBML and SBGN versions for the cytokinesis submodel. Significant work remains to finish encoding the model. We must finish encoding the submodels, expand software tools such as iBioSim to support multi-algorithm modeling, rigorously test the entire SBML model by reproducing all of the MATLAB model tests, and thoroughly diagram and document the SBML model.
\subsection{Future steps}
We hope to finish encoding the \textit{M. genitalium} submodels in SBML and integrating the SBML-encoded submodels into a single model that can be simulated by open-source software tools such as BioUML, COPASI, iBioSim, and libRoadRunner. Several students and instructors have continued to work and meet online. Many also plan to participate in a second meeting in October 2015 that will be held at the University of Utah, USA, immediately prior to the 2015 COMBINE Forum (\url{http://co.mbine.org/events/COMBINE_2015}).
Going forward, we hope to publish SBML-encoded versions of each of the \textit{M. genitalium} submodels to BioModels, along with SED-ML tests, SBGN diagrams, and textual documentation. This would make the submodels searchable, retrievable, and reusable by other scientists. We believe this would be a valuable community resource. It would demonstrate how to build a whole-cell model and enable other researchers to build upon the \textit{M. genitalium} submodels.
\section{Toward SBML-, SED-ML-, and SBGN-based standards for whole-cell modeling}
The school was the first attempt to encode a whole-cell model using open standards. Thus, we were not surprised to discover several limitations of currently available formats. More importantly, the school generated ideas for how our existing standards and tools can be expanded to better support large, heterogeneous models.
\subsection{Standard extensions}
Several enhancements to simulation software tools and databases are needed to facilitate whole-cell modeling (Table~\ref{tab:new-standards}). First, more SBML simulators must support multi-algorithm simulations. This will require research to determine the best way to integrate heterogeneous submodels, including rigorously evaluating the schemes proposed by the integration group. Significant effort will also be needed to develop an efficient, parallelized, multi-algorithm simulator.
\begin{figure*}[bt!]
\centering
\includegraphics[width=\textwidth]{figure1/figure1.pdf}
\caption{\label{fig:1} Whole-cell modeling pipeline. First, researchers will assemble experimental data into pathway/genome databases. Second, researchers will use pathway/genome databases to construct submodels. Third, researchers will use multi-algorithm simulators to conduct \textit{in silico} experiments. Lastly, researchers will analyze \textit{in silico} experiments to learn new biology.}
\end{figure*}
Second, more SBML simulators must implement the SBML Multistate and Multicomponent Species package to support hybrid population/particle-based state representation such as that used by BioNetGen~\cite{Hlavacek2006, Hogg2014} and NFSim~\cite{Sneddon2011}. This would enable more succinct model descriptions, making models much easier to understand, edit, and expand. For example, translation could be described using a single reaction pattern and parameterized by arrays of mRNA-specific translation initiation rates and codon-specific elongation rates rather than by enumerating each reaction. By separating mathematical descriptions from quantitative parameter values, reaction patterns would also make the connection between dynamical models and the experimental data used to inform their parameters more transparent. Implementing the SBML Multistate and Multicomponent Species package would also enable modelers to efficiently simulate models with large, combinatorial state spaces.
New user-friendly graphical editors must also be developed to enable researchers to easily build SBML files with these new features. These graphical editors must also allow researchers to transparently map model parameters onto experimental data organized in pathway/genome databases.
In addition, molecular biology databases such as ChEBI must be expanded to enable researchers to concretely define whole-cell models in terms of external entities.
SBGN must also be expanded in several ways. SBGN should support hybrid diagrams that contain elements from Process Description, Entity Relationship, and Activity Flow. It also needs to support generic reactions to reduce the size of a diagram and the amount of repetitive reaction chains with simple modifications of reaction partners, such as fatty acid synthesis. The SBGN viewers must also be expanded to provide advanced layout algorithms suitable for large visualizations and automatically display diagrams at multiple levels of granularity using contextual zooming and model reduction.
Together, these software, database, and SBGN expansions would enable more researchers to more easily build, manage, simulate, and reproduce whole-cell models and simulations. These new standards and tools would also enable researchers to build more comprehensive and more accurate models. Ultimately, this would enable whole-cell modeling to support bioengineering and personalized medicine.
\subsection{The whole-cell modeling pipeline}
We anticipate that such expanded standards and tools will enable a four-step approach to whole-cell model-driven discovery (Fig.~\ref{fig:1}). First, researchers will assemble experimental data from numerous sources including databases such as SABIO-RK~\cite{Wittig2012} and UniProt~\cite{UniProt2015} into pathway/genome databases using software tools such as Pathway Tools~\cite{Karp2010} and WholeCellKB. Second, researchers will use pathway/genome databases and graphical modeling tools such as BioUML, COPASI, and iBioSim to build submodels and encode them using transparent languages such as SBML. Third, multi-algorithm simulators will be used to conduct \textit{in silico} experiments. Lastly, software tools such as WholeCellSimDB and WholeCellViz will be used to discover new biology through exploring, visualizing, and analyzing \textit{in silico} experiments.
\section{Conclusion}
The 2015 Whole-Cell Modeling Summer School provided 43 young scientists training in whole-cell and multi-algorithm modeling by encoding the \textit{M. genitalium} whole-cell model in SBML. Additional courses are needed to provide students with deeper theoretical training in dynamical modeling, multi-algorithm modeling, model reduction, and parameter estimation, as well as practical training in model construction including data curation, model building, numerical optimization, model testing, and model analysis.
Significant strides were made toward implementing the \textit{M. genitalium} whole-cell model using SBML.
We developed preliminary SBML versions of all of the submodels of the \textit{M. genitalium} model. We have continued to encode the \textit{M. genitalium} model since the school. Ultimately, we hope to publish an SBML-encoded version of the model to BioModels.
In addition, the summer school generated clear goals for expanding the existing SBML software tools to support whole-cell modeling. The SBML simulators must be expanded to support all of the Hierarchical Model Composition, Multistate and Multicomponent Species, Arrays, and Flux Balance Constraints packages to efficiently simulate whole-cell models. Furthermore, the school unified modelers, software developers, and standards developers to develop standardized, open-source tools for whole-cell and other large models.
New parameter estimation, model testing, and visual analysis tools must also be developed to enable researchers to effectively use SBML-encoded whole-cell models for research. In addition, our molecular biology databases must be expanded to facilitate whole-cell model annotation. Furthermore, SBGN and the SBGN viewers must be expanded to support hybrid diagrams, advanced automatic graph layout, automatic graph reduction, and contextual zooming. CellML should also be rigorously evaluated as another potential whole-cell modeling standard.
In summary, we believe that whole-cell modeling has the potential to be an important tool for biological discovery, bioengineering, and medicine. Achieving this potential requires new simulation software for simulating whole-cell models. In turn, this requires expanding the whole-cell modeling field including training young researchers.
\ifCLASSOPTIONcaptionsoff
\newpage
\fi
\bibliographystyle{IEEEtran}
\bibliography{IEEEabrv,report}
% biography section
%
% If you have an EPS/PDF photo (graphicx package needed) extra braces are needed around the contents of the optional argument to biography to prevent
% the LaTeX parser from getting confused when it sees the complicated \includegraphics command within an optional argument. (You could create
% your own custom macro containing the \includegraphics command to make things simpler here.)
\begin{IEEEbiography}[{\includegraphics[width=3.45in,keepaspectratio]{photos/group.jpg}}]{}
~\\
~\\
~\\
~\\
~\\
~\\
~\\
~\\
~\\
~\\
~\\
\textbf{2015 Whole-Cell Modeling Summer School} included the 54 participants listed in Table~SI in the supplemental material.
\end{IEEEbiography}
% You can push biographies down or up by placing a \vfill before or after them. The appropriate
% use of \vfill depends on what kind of text is on the last page and whether or not the columns are being equalized.
\vfill
% Can be used to pull up biographies so that the bottom of the last one is flush with the other column.
%\enlargethispage{-5in}
% ..........................................................................
% End.
%\clearpage
%\setcounter{table}{0}
%\renewcommand{\thetable}{S\Roman{table}}
% Moved into supplemental due to page limitations
\end{document}