home *** CD-ROM | disk | FTP | other *** search
- Path: sparky!uunet!ogicse!hp-cv!sdd.hp.com!ux1.cso.uiuc.edu!roundup.crhc.uiuc.edu!moon.crhc.uiuc.edu!not-for-mail
- From: ymwang@crhc.uiuc.edu (Yi-Min Wang)
- Newsgroups: comp.arch
- Subject: Academic and Commercial Checkpointing
- Keywords: Checkpoint, Rollback Recovery, Parallel and Distributed Systems
- Message-ID: <185e6nINNdv3@moon.crhc.uiuc.edu>
- Date: 3 Sep 92 06:23:51 GMT
- Article-I.D.: moon.185e6nINNdv3
- Organization: Center for Reliable and High-Performance Computing, University of Illinois at Urbana-Champaign
- Lines: 555
- NNTP-Posting-Host: moon.crhc.uiuc.edu
-
- I am collecting information on "(process state) checkpointing
- in commercial systems". Any information would be highly appreciated.
-
- Yi-Min Wang
- ymwang@crhc.uiuc.edu
-
- p.s. I have also been doing a literature survey on the subject of
- "checkpointing and rollback recovery for parallel and distributed systems".
- The following is a list of the 55 papers I have collected so far. Any additions and
- comments are welcome.
-
- @string{ACM-TOCS   = {ACM Trans. on Computer Systems}}
- @string{ACM-SIGOPS = {ACM Operating Systems Review}}
- @string{IEEE-TC    = {IEEE Trans. on Computers}}
- @string{IEEE-TPDS  = {IEEE Trans. on Parallel and Distributed Systems}}
- @string{IEEE-TSE   = {IEEE Trans. on Software Engineering}}
- @string{IPL        = {Information Processing Letters}}
- @string{UIUC-CRHC  = {Coordinated Science Laboratory, University of Illinois at Urbana--Champaign}}
- @string{ICPP       = {Proc. Int'l Conf. on Parallel Processing}}
- @string{FTCS       = {Proc. IEEE Fault-Tolerant Computing Symposium}}
- @string{SRDS       = {Proc. IEEE Symp. on Reliable Distr. Syst.}}
- @string{ICDCS      = {Proc. IEEE Int'l Conf. on Distributed Computing Systems}}
-
- @article{1,
-   author  = {Randell, B.},
-   title   = {System structure for software fault tolerance},
-   journal = IEEE-TSE,
-   volume  = {SE-1},
-   number  = {2},
-   pages   = {220--232},
-   month   = jun,
-   year    = {1975},
- }
-
- @article{2,
-   author  = {Russell, D. L.},
-   title   = {State restoration in systems of communicating processes},
-   journal = IEEE-TSE,
-   volume  = {SE-6},
-   number  = {2},
-   pages   = {183--194},
-   month   = mar,
-   year    = {1980},
- }
-
- @article{3,
-   author  = {Kim, K. H.},
-   title   = {Approaches to Mechanization of the Conversation Scheme Based on Monitors},
-   journal = IEEE-TSE,
-   volume  = {SE-8},
-   number  = {3},
-   pages   = {189--197},
-   month   = may,
-   year    = {1982},
- }
-
- @article{4,
-   author  = {Koo, R. and Toueg, S.},
-   title   = {Checkpointing and rollback-recovery for distributed systems},
-   journal = IEEE-TSE,
-   volume  = {SE-13},
-   number  = {1},
-   pages   = {23--31},
-   month   = jan,
-   year    = {1987},
- }
-
- @inproceedings{5,
-   author    = {Tamir, Y. and S{\'e}quin, C. H.},
-   title     = {Error recovery in multicomputers using global checkpoints},
-   booktitle = ICPP,
-   pages     = {32--41},
-   year      = {1984},
- }
-
- @inproceedings{6,
-   author    = {Janssens, B. and Fuchs, W. K.},
-   title     = {Experimental evaluation of multiprocessor cache-based error recovery},
-   booktitle = ICPP,
-   pages     = {I-505--I-508},
-   year      = {1991},
- }
-
- @inproceedings{7,
-   author    = {Wang, Y. M. and Fuchs, W. K.},
-   title     = {Scheduling message processing for reducing rollback propagation},
-   booktitle = FTCS,
-   pages     = {204--211},
-   month     = jul,
-   year      = {1992},
- }
-
- @article{8,
-   author  = {Chandy, K. M. and Lamport, L.},
-   title   = {Distributed snapshots: Determining global states of distributed systems},
-   journal = ACM-TOCS,
-   volume  = {3},
-   number  = {1},
-   pages   = {63--75},
-   month   = feb,
-   year    = {1985},
- }
-
- @article{9,
-   author  = {Strom, R. E. and Yemini, S.},
-   title   = {Optimistic recovery in distributed systems},
-   journal = ACM-TOCS,
-   volume  = {3},
-   number  = {3},
-   pages   = {204--226},
-   month   = aug,
-   year    = {1985},
- }
-
- @article{10,
-   author  = {Lai, T. H. and Yang, T. H.},
-   title   = {On Distributed Snapshots},
-   journal = IPL,
-   volume  = {25},
-   pages   = {153--158},
-   month   = may,
-   year    = {1987},
- }
-
- @article{11,
-   author  = {Venkatesh, K. and Radhakrishnan, T. and Li, H. F.},
-   title   = {Optimal Checkpointing and Local Recording for Domino-Free Rollback Recovery},
-   journal = IPL,
-   volume  = {25},
-   pages   = {295--303},
-   month   = jul,
-   year    = {1987},
- }
-
- @article{12,
-   author  = {Wu, K. L. and Fuchs, W. K. and Patel, J. H.},
-   title   = {Error recovery in shared memory multiprocessors using private caches},
-   journal = IEEE-TPDS,
-   volume  = {1},
-   number  = {2},
-   pages   = {231--240},
-   month   = apr,
-   year    = {1990},
- }
-
- @article{13,
-   author  = {Tong, Z. and Kain, R. Y. and Tsai, W. T.},
-   title   = {Rollback recovery in distributed systems using loosely synchronized clocks},
-   journal = IEEE-TPDS,
-   volume  = {3},
-   number  = {2},
-   pages   = {246--251},
-   month   = mar,
-   year    = {1992},
- }
-
- @article{14,
-   author  = {Wu, K. L. and Fuchs, W. K.},
-   title   = {Recoverable distributed shared virtual memory},
-   journal = IEEE-TC,
-   volume  = {39},
-   number  = {4},
-   pages   = {460--469},
-   month   = apr,
-   year    = {1990},
- }
-
- @article{15,
-   author  = {Elnozahy, E. N. and Zwaenepoel, W.},
-   title   = {Manetho: {T}ransparent rollback-recovery with low overhead, limited rollback and fast output commit},
-   journal = IEEE-TC,
-   volume  = {41},
-   number  = {5},
-   pages   = {526--531},
-   month   = may,
-   year    = {1992},
- }
-
- @article{16,
-   author  = {Li, Kai and Naughton, Jeffrey F. and Plank, James S.},
-   title   = {An efficient checkpointing method for multicomputers with wormhole routing},
-   journal = {International Journal of Parallel Programming},
-   volume  = {20},
-   number  = {3},
-   month   = jun,
-   year    = {1992},
- }
-
- @inproceedings{17,
-   author    = {Ahmed, R. E. and Frazier, R. C. and Marinos, P. N.},
-   title     = {Cache-aided rollback error recovery ({CARER}) algorithms for shared-memory multiprocessor systems},
-   booktitle = FTCS,
-   pages     = {82--88},
-   year      = {1990},
- }
-
- @inproceedings{18,
-   author    = {Kim, K. H. and You, J. H. and Abouelnaga, A.},
-   title     = {A scheme for coordinated execution of independently designed recoverable distributed processes},
-   booktitle = FTCS,
-   pages     = {130--135},
-   year      = {1986},
- }
-
- @inproceedings{19,
-   author    = {Kim, K. H. and You, J. H.},
-   title     = {A highly decentralized implementation model for the {P}rogrammer-{T}ransparent {C}oordination ({PTC}) scheme for cooperative recovery},
-   booktitle = FTCS,
-   pages     = {282--289},
-   year      = {1990},
- }
-
- @inproceedings{20,
-   author    = {Kim, K. H.},
-   title     = {Designing fault tolerance capabilities into real-time distributed computer systems},
-   booktitle = {Proc. Workshop on the Future Trends of Distributed Computing Systems in the 1990's},
-   pages     = {318--328},
-   year      = {1988},
- }
-
- @inproceedings{21,
-   author    = {Nett, E.},
-   title     = {The recovery problem in distributed systems},
-   booktitle = {Proc. Workshop on the Future Trends of Distributed Computing Systems in the 1990's},
-   pages     = {357--365},
-   year      = {1988},
- }
-
- @inproceedings{22,
-   author    = {Gregory, S. T. and Knight, J. C.},
-   title     = {On the provision of backward error recovery in production programming languages},
-   booktitle = FTCS,
-   pages     = {506--511},
-   year      = {1989},
- }
-
- @inproceedings{23,
-   author    = {Tsuruoka, K. and Kaneko, A. and Nishihara, Y.},
-   title     = {Dynamic recovery schemes for distributed processes},
-   booktitle = {Proc. IEEE 2nd Symp. on Reliability in Distributed Software and Database Systems},
-   pages     = {124--130},
-   year      = {1981},
- }
-
- @inproceedings{24,
-   author    = {Briatico, D. and Ciuffoletti, A. and Simoncini, L.},
-   title     = {A Distributed Domino-Effect Free Recovery Algorithm},
-   booktitle = {Proc. IEEE 4th Symp. on Reliability in Distributed Software and Database Systems},
-   pages     = {207--215},
-   year      = {1984},
- }
-
- @inproceedings{25,
-   author    = {Cristian, F. and Jahanian, F.},
-   title     = {A timestamp-based checkpointing protocol for long-lived distributed computations},
-   booktitle = SRDS,
-   pages     = {12--20},
-   year      = {1991},
- }
-
- @inproceedings{26,
-   author    = {Li, Kai and Naughton, J. F. and Plank, J. S.},
-   title     = {Checkpointing multicomputer applications},
-   booktitle = SRDS,
-   pages     = {2--11},
-   year      = {1991},
- }
-
- @inproceedings{27,
-   author    = {Li, Kai and Naughton, J. F. and Plank, J. S.},
-   title     = {Real-time, concurrent checkpointing for parallel programs},
-   booktitle = {Proc. 2nd ACM SIGPLAN Symp. on Principles and Practice of Parallel Programming},
-   pages     = {79--88},
-   month     = mar,
-   year      = {1990},
- }
-
- @inproceedings{28,
-   author    = {Ahamad, M. and Lin, L.},
-   title     = {Using checkpoints to localize the effects of faults in distributed systems},
-   booktitle = SRDS,
-   pages     = {2--11},
-   year      = {1989},
- }
-
- @inproceedings{29,
-   author    = {Borg, A. and Baumbach, J. and Glazer, S.},
-   title     = {A message system supporting fault-tolerance},
-   booktitle = {Proc. 9th ACM Symp. on Operating Systems Principles},
-   pages     = {90--99},
-   year      = {1983},
- }
-
- @inproceedings{30,
-   author    = {Powell, M. L. and Presotto, D. L.},
-   title     = {Publishing: A reliable broadcast communication mechanism},
-   booktitle = {Proc. 9th ACM Symp. on Operating Systems Principles},
-   pages     = {100--109},
-   year      = {1983},
- }
-
- @inproceedings{31,
-   author    = {Sistla, A. P. and Welch, J. L.},
-   title     = {Efficient distributed recovery using message logging},
-   booktitle = {Proc. 8th ACM Symposium on Principles of Distributed Computing},
-   pages     = {223--238},
-   year      = {1989},
- }
-
- @inproceedings{32,
-   author    = {Spezialetti, M. and Kearns, P.},
-   title     = {Efficient distributed snapshots},
-   booktitle = ICDCS,
-   pages     = {382--388},
-   year      = {1986},
- }
-
- @inproceedings{33,
-   author    = {Juang, T. T-Y. and Venkatesan, S.},
-   title     = {Crash recovery with little overhead},
-   booktitle = ICDCS,
-   pages     = {454--461},
-   year      = {1991},
- }
-
- @inproceedings{34,
-   author    = {Venkatesan, S.},
-   title     = {Message-optimal incremental snapshots},
-   booktitle = ICDCS,
-   pages     = {53--60},
-   year      = {1989},
- }
-
- @inproceedings{35,
-   author    = {Bhargava, B. and Lian, S. R.},
-   title     = {Independent checkpointing and concurrent rollback for recovery --- {A}n optimistic approach},
-   booktitle = SRDS,
-   pages     = {3--12},
-   year      = {1988},
- }
-
- @inproceedings{36,
-   author    = {Ramanathan, P. and Shin, K. G.},
-   title     = {Checkpointing and Rollback Recovery in a Distributed System Using Common Time Base},
-   booktitle = SRDS,
-   pages     = {13--21},
-   year      = {1988},
- }
-
- @inproceedings{37,
-   author    = {Lowry, A. and Russell, J. R. and Goldberg, A. P.},
-   title     = {Optimistic Failure Recovery for Very Large Networks},
-   booktitle = SRDS,
-   pages     = {66--75},
-   year      = {1991},
- }
-
- @inproceedings{38,
-   author    = {Goldberg, A. P. and Gopal, A. and Lowry, A. and Strom, R. E.},
-   title     = {Restoring consistent global states of distributed computations},
-   booktitle = {Proc. ACM/ONR Workshop on Parallel and Distributed Debugging},
-   month     = may,
-   year      = {1991},
- }
-
- @inproceedings{39,
-   author    = {Goldberg, A. P. and Gopal, A. and Li, K. and Strom, R. E. and Bacon, D. F.},
-   title     = {Transparent recovery of {M}ach applications},
-   booktitle = {First USENIX Mach Workshop},
-   month     = oct,
-   year      = {1990},
- }
-
- @inproceedings{40,
-   author    = {Johnson, D. B. and Zwaenepoel, W.},
-   title     = {Sender-Based Message Logging},
-   booktitle = FTCS,
-   pages     = {14--19},
-   year      = {1987},
- }
-
- @inproceedings{41,
-   author    = {Strom, R. E. and Bacon, D. F. and Yemini, S. A.},
-   title     = {Volatile Logging in n-Fault-Tolerant Distributed Systems},
-   booktitle = FTCS,
-   pages     = {44--49},
-   year      = {1988},
- }
-
- @article{42,
-   author  = {Shin, K. G. and Lee, Y.-H.},
-   title   = {Evaluation of Error Recovery Blocks Used for Cooperating Processes},
-   journal = IEEE-TSE,
-   volume  = {SE-10},
-   number  = {6},
-   pages   = {692--700},
-   year    = {1984},
- }
-
- @article{43,
-   author  = {Lee, Y.-H. and Shin, K. G.},
-   title   = {Design and evaluation of a fault-tolerant multiprocessor using hardware recovery blocks},
-   journal = IEEE-TC,
-   volume  = {C-33},
-   number  = {2},
-   pages   = {113--124},
-   year    = {1984},
- }
-
- @article{44,
-   author  = {Johnson, D. B. and Zwaenepoel, W.},
-   title   = {Recovery in distributed systems using optimistic message logging and checkpointing},
-   journal = {J. of Algorithms},
-   volume  = {11},
-   pages   = {462--491},
-   year    = {1990},
- }
-
- @article{45,
-   author  = {Johnson, D. B. and Zwaenepoel, W.},
-   title   = {Transparent optimistic rollback recovery},
-   journal = ACM-SIGOPS,
-   pages   = {99--102},
-   month   = apr,
-   year    = {1991},
- }
-
- @article{46,
-   author  = {Bacon, D. F.},
-   title   = {Transparent Recovery in Distributed Systems},
-   journal = ACM-SIGOPS,
-   pages   = {91--94},
-   month   = apr,
-   year    = {1991},
- }
-
- @article{47,
-   author  = {Borg, A. and Blau, W. and Graetsch, W. and Herrmann, F. and Oberle, W.},
-   title   = {Fault tolerance under {UNIX}},
-   journal = ACM-TOCS,
-   volume  = {7},
-   number  = {1},
-   pages   = {1--24},
-   month   = feb,
-   year    = {1989},
- }
-
- @unpublished{48,
-   author = {Elnozahy, E. N. and Johnson, D. B. and Zwaenepoel, W.},
-   title  = {The performance of consistent checkpointing},
-   note   = {To appear in {\it Proc. IEEE 11th Symp. on Reliable Distributed Systems}},
-   month  = oct,
-   year   = {1992},
- }
-
- @unpublished{49,
-   author = {Wang, Y. M. and Fuchs, W. K.},
-   title  = {{O}ptimistic message logging for independent checkpointing in message-passing systems},
-   note   = {To appear in {\it Proc. IEEE 11th Symp. on Reliable Distributed Systems}},
-   month  = oct,
-   year   = {1992},
- }
-
- @unpublished{50,
-   author = {Acharya, A. and Badrinath, B. R.},
-   title  = {Recording distributed snapshots based on causal order of message delivery},
-   note   = {To appear in {\it Information Processing Letters}},
-   year   = {1992},
- }
-
- @unpublished{51,
-   author = {Long, J. and Fuchs, W. K.},
-   title  = {An evolutionary approach to coordinated checkpointing},
-   note   = {Submitted to {\it IEEE Trans. on Parallel and Distributed Systems}},
-   year   = {1992},
- }
-
- @techreport{52,
-   author      = {Wang, Y. M. and Chung, P. Y. and Lin, I. J. and Fuchs, W. K.},
-   title       = {Reducing space overhead for independent checkpointing},
-   institution = UIUC-CRHC,
-   number      = {CRHC-92-06},
-   year        = {1992},
- }
-
- @techreport{53,
-   author      = {Critchlow, Carol and Taylor, Kim},
-   title       = {The Inhibition Spectrum and the Achievement of Causal Consistency},
-   institution = {Cornell University},
-   number      = {TR 90-1101},
-   month       = feb,
-   year        = {1990},
- }
-
- @techreport{54,
-   author      = {Ahuja, M.},
-   title       = {Repeated global snapshots in asynchronous distributed systems},
-   institution = {Ohio State University},
-   number      = {OSU-CISRC-8/89 TR40},
-   month       = aug,
-   year        = {1989},
- }
-
- @techreport{55,
-   author      = {Kaashoek, M. Frans and Michiels, Raymond and Bal, Henri E. and Tanenbaum, Andrew S.},
-   title       = {Transparent Fault-Tolerance in Parallel {O}rca Programs},
-   institution = {Vrije Universiteit, Amsterdam},
-   number      = {IR-258},
-   month       = oct,
-   year        = {1991},
- }
-