%% This BibTeX bibliography file was created using BibDesk.
%% http://www.cs.ucsd.edu/~mmccrack/bibdesk.html

%% Created for Matthias Wiesmann at 2006-11-06 17:29:26 +0900

%% Saved with string encoding Western (ASCII)

@article{Chu98,
  Author = {Chu, F.},
  Date-Added = {2006-10-31 10:29:45 +0900},
  Date-Modified = {2006-11-06 17:00:47 +0900},
  Journal = {Information Processing Letters},
  Keywords = {leader election, failure detection},
  Language = {English},
  Month = sep,
  Number = {6},
  Pages = {289--293},
  Title = {Reducing {$\Omega$} to {$\diamond$W}},
  Url = {http://md1.csa.com/partners/viewrecord.php?requester=gs&collection=TRD&recid=363387CI&q=&uid=789132921&q=&uid=789132921},
  Volume = {67},
  Year = {1998},
  Abstract = {One of the most important problems in fault-tolerant distributed computing is consensus. Unfortunately, consensus is not solvable in an asynchronous system with even a single crash failure. To solve this, one approach is to augment the system with a failure detector D, a distributed oracle that gives processes hints about which processes have crashed. Each process p has access to a local module $D_p$ which it can query to get hints about failures. In general, p will see only a subsequence of the outputs of $D_p$ due to the asynchronous nature of the system. Process p suspects q at time t if, at time t, the output of $D_p$ suggests that q is crashed. If, in addition, p sees this output at time t, then p doubts q at time t.}}

@inproceedings{MOZ05,
  Author = {Malkhi, D. and Oprea, F. and Zhou, L.},
  Booktitle = {Proc. of the 19$^{th}$ Int. Symp. 
on Distributed Computing (DISC'05)},
  Date-Added = {2006-10-25 15:18:48 +0900},
  Date-Modified = {2006-10-25 15:36:55 +0900},
  Keywords = {leader election, consensus},
  Language = {English},
  Month = jul,
  Pages = {199--213},
  Publisher = {Springer-Verlag},
  Title = {{$\Omega$} Meets {Paxos}: Leader Election and Stability without Eventual Timely Links},
  Url = {ftp://ftp.research.microsoft.com/pub/tr/TR-2005-93.pdf},
  Year = {2005},
  Abstract = {This paper provides a realization of distributed leader election without having any eventual timely links. Progress is guaranteed in the following weak setting: Eventually one process can send messages such that every message obtains f timely responses, where f is a resilience bound. A crucial facet of this property is that the f responders need not be fixed, and may change from one message to another. In particular, this means that no specific link needs to remain timely. In the (common) case where f=1, this implies that the FLP impossibility result on consensus is circumvented if one process can at any time communicate in a timely manner with one other process in the system. The protocol also bears significant practical importance to well-known coordination schemes such as Paxos, because our setting more precisely captures the conditions on the elected leader for reaching timely consensus. Additionally, an extension of our protocol provides leader stability, which guarantees against arbitrary demotion of a qualified leader and avoids performance penalties associated with leader changes in schemes such as Paxos.}}

@inproceedings{ADFT04,
  Author = {Aguilera, M. K. and Delporte-Gallet, C. and Fauconnier, H. and Toueg, S.},
  Booktitle = {Proc. of the 23$^{rd}$ annual Symp. on Principles of Distributed Computing (PODC'04)},
  Date-Added = {2006-10-25 15:10:00 +0900},
  Date-Modified = {2006-10-31 10:41:08 +0900},
  Isbn = {1-58113-802-4},
  Keywords = {leader election},
  Location = {St. 
John's, Newfoundland, Canada},
  Organization = {ACM},
  Pages = {328--337},
  Title = {Communication-efficient leader election and consensus with limited link synchrony},
  Url = {http://doi.acm.org/10.1145/1011767.1011816},
  Year = {2004},
  Abstract = {We study the degree of synchrony required to implement the leader election failure detector {$\Omega$} and to solve consensus in partially synchronous systems. We show that in a system with n processes and up to f process crashes, one can implement {$\Omega$} and solve consensus provided there exists some (unknown) correct process with f outgoing links that are eventually timely. In the special case where f = 1, an important case in practice, this implies that to implement {$\Omega$} and solve consensus it is sufficient to have just one eventually timely link -- all the other links in the system, $\Theta(n^2)$ of them, may be asynchronous. There is no need to know which link $p \rightarrow q$ is eventually timely, when it becomes timely, or what is its bound on message delay. Surprisingly, it is not even required that the source p or destination q of this link be correct: either p or q may actually crash, in which case the link $p \rightarrow q$ is eventually timely in a trivial way, and it is useless for sending messages. We show that these results are in a sense optimal: even if every process has f - 1 eventually timely links, neither {$\Omega$} nor consensus can be solved. We also give an algorithm that implements {$\Omega$} in systems where some correct process has f outgoing links that are eventually timely, such that eventually only f links carry messages, and we show that this is optimal. For f = 1, this algorithm ensures that all the links, except for one, eventually become quiescent.}}

@inproceedings{FJR06,
  Author = {Fern{\'a}ndez, A. and Jim{\'e}nez, E. and Raynal, M.},
  Booktitle = {Proceedings of the Int. Conf. 
on Dependable Systems and Networks (DSN'06)},
  Date-Added = {2006-10-24 13:37:09 +0900},
  Date-Modified = {2006-11-06 17:04:38 +0900},
  Isbn = {0-7695-2607-1},
  Keywords = {Failure detection, leader election},
  Language = {English},
  Month = jun,
  Organization = {IEEE},
  Pages = {166--189},
  Title = {Eventual Leader Election with Weak Assumptions on Initial Knowledge, Communication Reliability and Synchrony},
  Url = {http://doi.ieeecomputersociety.org/10.1109/DSN.2006.34},
  Year = {2006},
  Abstract = {This paper considers the eventual leader election problem in asynchronous message-passing systems where an arbitrary number t of processes can crash ($t < n$, where n is the total number of processes). It considers weak assumptions both on the initial knowledge of the processes and on the network behavior. More precisely, initially, a process knows only its identity and the fact that the process identities are different and totally ordered (it knows neither n nor t). Two eventual leader election protocols are presented. The first protocol assumes that a process also knows the lower bound on the number of processes that do not crash. This protocol requires the following behavioral properties from the underlying network: the graph made up of the correct processes and fair lossy links is strongly connected, and there is a correct process connected to t - f other correct processes (where f is the actual number of crashes in the considered run) through eventually timely paths (paths made up of correct processes and eventually timely links). This protocol is not communication-efficient in the sense that each correct process has to send messages forever. The second protocol is communication-efficient: after some time, only the final common leader has to send messages forever. 
This protocol does not require the processes to know this lower bound, but requires stronger properties from the underlying network: each pair of correct processes has to be connected by fair lossy links (one in each direction), and there is a correct process whose output links to the rest of correct processes have to be eventually timely. This protocol enjoys also the property that each message is made up of several fields, each of which taking values from a finite domain.}}

@article{WS05,
  Address = {Los Alamitos, CA, USA},
  Author = {Wiesmann, Matthias and Schiper, Andre},
  Date-Added = {2006-10-16 19:09:35 +0900},
  Date-Modified = {2006-10-16 19:10:08 +0900},
  Doi = {10.1109/TKDE.2005.54},
  Issn = {1041-4347},
  Journal = {IEEE Transactions on Knowledge and Data Engineering},
  Number = {4},
  Pages = {551--566},
  Publisher = {IEEE Computer Society},
  Title = {Comparison of Database Replication Techniques Based on Total Order Broadcast},
  Volume = {17},
  Year = {2005},
  Abstract = {In this paper, we present a performance comparison of database replication techniques based on total order broadcast. While the performance of total order broadcast-based replication techniques has been studied in previous papers, this paper presents many new contributions. First, it compares with each other techniques that were presented and evaluated separately, usually by comparing them to a classical replication scheme like distributed locking. Second, the evaluation is done using a finer network model than previous studies. Third, the paper compares techniques that offer the same consistency criterion (one-copy serializability) in the same environment using the same settings. The paper shows that, while networking performance has little influence in a LAN setting, the cost of synchronizing replicas is quite high. Because of this, total order broadcast-based techniques are very promising as they minimize synchronization between replicas.}}

@inproceedings{MS05,
  Author = {Mena, S. 
and Schiper, A.},
  Booktitle = {Proc. of the 24$^{th}$ Symp. on Reliable Distributed Systems (SRDS)},
  Date-Added = {2006-08-23 18:47:56 +0900},
  Date-Modified = {2006-09-15 15:23:35 +0900},
  Isbn = {0-7695-2463-X},
  Keywords = {atomic broadcast},
  Language = {English},
  Organization = {IEEE},
  Pages = {202--214},
  Title = {A new look at atomic broadcast in the asynchronous crash-recovery model},
  Url = {http://doi.ieeecomputersociety.org/10.1109/RELDIS.2005.6},
  Year = {2005},
  Abstract = {Atomic broadcast in particular, and group communication in general, have mainly been specified and implemented in a system model where processes do not recover after a crash. The model is called crash-stop. The drawback of this model is its inability to express algorithms that tolerate the crash of a majority of processes. This has led to extend the crash-stop model to the so-called crash-recovery model, in which processes have access to stable storage, to log their state periodically. This allows them to recover a previous state after a crash. However, the existing specifications of atomic broadcast in the crash-recovery model are not satisfactory, and the paper explains why. The paper also proposes a new specification of atomic broadcast in the crash-recovery model that addresses these issues. Specifically, our new specification allows to distinguish between a uniform and a nonuniform version of atomic broadcast. The non-uniform version logs less information, and is thus more efficient. The uniform and non-uniform atomic broadcast have been implemented and compared with a published atomic broadcast algorithm. Performance results are presented.}}

@inproceedings{BLKC03,
  Author = {Bouteiller, A. and Lemarinier, P. and Krawezik, G. and Cappello, F.},
  Booktitle = {Proc. of the Int. Conf. 
on Cluster Computing (CLUSTER)},
  Title = {Coordinated Checkpoint versus Message Log for Fault Tolerant {MPI}},
  Year = {2003},
  Pages = {242--250},
  Organization = {IEEE},
  Isbn = {0-7695-2066-9},
  Url = {http://doi.ieeecomputersociety.org/10.1109/CLUSTR.2003.1253321},
  Date-Added = {2006-08-23 18:05:38 +0900},
  Date-Modified = {2006-08-23 18:17:44 +0900},
  Abstract = {MPI is one of the most adopted programming models for Large Clusters and Grid deployments. However, these systems often suffer from network or node failures. This raises the issue of selecting a fault tolerance approach for MPI. Automatic and transparent ones are based on either coordinated checkpointing or message logging associated with uncoordinated checkpoint. They are many protocols, implementations and optimizations for these approaches but few results about their comparison. Coordinated checkpoint has the advantage of a very low overhead on fault free executions. In contrary a message logging protocol systematically adds a significant message transfer penalty. The drawbacks of coordinated checkpoint come from its synchronization cost at checkpoint and restart times. In this paper we implement, evaluate and compare the two kinds of protocols with a special emphasis on their respective performance according to fault frequency. The main conclusion (under our experimental conditions) is that message logging becomes relevant for a large scale cluster from one fault every hour for applications with large dataset.}}

@inproceedings{Sch04,
  Author = {Schiper, A.},
  Booktitle = {Proc. of the 3$^{rd}$ Int. Symp. 
on Network Computing and Applications (NCA 2004)},
  Date-Added = {2006-08-23 11:08:06 +0900},
  Date-Modified = {2006-08-23 12:00:02 +0900},
  Isbn = {0-7695-2242-4},
  Keywords = {group communication, position-paper},
  Organization = {IEEE},
  Pages = {109--117},
  Title = {Group communication: where are we today and future challenges},
  Url = {http://doi.ieeecomputersociety.org/10.1109/NCA.2004.1347768},
  Year = {2004},
  Abstract = {Group communication is a topic studied since more than twenty years. During this period significant results have been obtained. Nevertheless there are issues that have not been addressed adequately. The paper presents both the important results related to group communication that have been obtained, but also points out what remains to be done to make group communication widely used, and as successful as some of the existing middleware technologies.}}

@article{RAV00,
  Author = {Rao, Sriram and Alvisi, Lorenzo and Vin, Harrick M.},
  Date-Added = {2006-08-23 11:06:09 +0900},
  Date-Modified = {2006-08-23 11:06:59 +0900},
  Issn = {1041-4347},
  Journal = {IEEE Transactions on Knowledge and Data Engineering},
  Keywords = {message logging},
  Number = {2},
  Pages = {160--173},
  Title = {The Cost of Recovery in Message Logging Protocols},
  Url = {http://doi.ieeecomputersociety.org/10.1109/69.842260},
  Volume = {12},
  Year = {2000},
  Abstract = {Past research in message logging has focused on studying the relative overhead imposed by pessimistic, optimistic, and causal protocols during failure-free executions. In this paper, we give the first experimental evaluation of the performance of these protocols during recovery. Our results suggest that applications face a complex trade-off when choosing a message logging protocol for fault tolerance. On the one hand, optimistic protocols can provide fast failure-free execution and good performance during recovery, but are complex to implement and can create orphan processes. 
On the other hand, orphan-free protocols either risk being slow during recovery, e.g., sender-based pessimistic and causal protocols, or incur a substantial overhead during failure-free execution, e.g., receiver-based pessimistic protocols. To address this trade-off, we propose hybrid logging protocols, a new class of orphan-free protocols. We show that hybrid protocols perform within two percent of causal logging during failure-free execution and within two percent of receiver-based logging during recovery.}}

@article{GR04,
  Author = {Guerraoui, R. and Raynal, M.},
  Date-Added = {2006-08-23 11:02:43 +0900},
  Date-Modified = {2006-08-23 11:05:11 +0900},
  Issn = {0018-9340},
  Journal = {{IEEE} Transactions on Computers},
  Keywords = {Consensus},
  Language = {English},
  Month = apr,
  Number = {4},
  Pages = {453--466},
  Title = {The information structure of indulgent consensus},
  Volume = {53},
  Year = {2004}}

@inproceedings{MHR98,
  Author = {Mostefaoui, A. and Hurfin, M. and Raynal, M.},
  Booktitle = {Proc. of the 17$^{th}$ Symp. on Reliable Distr. Syst. (SRDS)},
  Date-Added = {2006-08-23 10:48:18 +0900},
  Date-Modified = {2006-08-23 10:58:31 +0900},
  Issn = {1060-9857},
  Keywords = {consensus, crash recovery},
  Organization = {IEEE},
  Pages = {280--287},
  Title = {Consensus in Asynchronous Systems Where Processes Can Crash and Recover},
  Url = {http://doi.ieeecomputersociety.org/10.1109/RELDIS.1998.740510},
  Year = {1998},
  Abstract = {The Consensus problem is now well identified as being one of the most important problems encountered in the design and the construction of fault-tolerant distributed systems. This problem is defined as follows: processes have to reach a common decision, which depends on their inputs, despite failures. We consider the Consensus problem in asynchronous distributed systems augmented with unreliable failure detectors. Several protocols have been proposed for these systems, when process crashes are assumed to be definitive. 
This paper addresses the Consensus problem in a more practical asynchronous system model, namely in a context where processes can crash and recover. As a process crash entails the loss of its volatile memory, each process is equipped with a stable storage. So, to be efficient a Consensus protocol has to log as few critical data as possible. The proposed protocol uses a new class of failure detectors suited to the crash/recovery model. It is particularly efficient when, whether there are crashes or not, the underlying failure detector makes few mistakes. Additionally, the proposed protocol tolerates message duplication and copes with some message losses.}}

@inproceedings{WUD06b,
  Author = {Wiesmann, M. and Urb{\'a}n, P. and D{\'e}fago, X.},
  Booktitle = {Proceedings of the Symposium on Reliable Distributed Systems (SRDS 2006)},
  Date-Added = {2006-08-01 11:42:56 +0900},
  Date-Modified = {2006-08-01 12:00:42 +0900},
  Keywords = {SNMP, failure detection},
  Language = {English},
  Location = {Leeds, UK},
  Month = oct,
  Organization = {IEEE},
  Title = {An {SNMP} based failure detection service},
  Url = {http://ddsg.jaist.ac.jp/en/pub/WUD06b.html},
  Year = {2006},
  Abstract = {In this paper, we present the SNMP-FD service, a novel failure detection service entirely based on the Simple Network Management Protocol (SNMP). This approach promises better interoperability with external tools and failure information sources, including network equipment and cluster management tools. We first show how the SNMP standard can be used to build a failure detection service. We describe the already standardized interfaces that can be reused and introduce the interfaces that need to be added. SNMP is used extensively in the service: for messaging, process status description, configuration, services statistics and delivering failure detection information to applications. We then present our implementation and an evaluation of performance and quality of service. 
}}

@inproceedings{WD06,
  Author = {Wiesmann, M. and D{\'e}fago, X.},
  Booktitle = {Proceedings of the Pacific Rim International Symposium on Dependable Computing (PRDC'06)},
  Date-Added = {2006-08-01 11:41:52 +0900},
  Date-Modified = {2006-08-01 11:42:39 +0900},
  Keywords = {Consensus, end-to-end},
  Language = {English},
  Location = {Riverside, USA},
  Month = dec,
  Organization = {IEEE},
  Title = {End-to-end consensus using end-to-end channels},
  Url = {http://ddsg.jaist.ac.jp/en/pub/WD06.html},
  Year = {2006}}

@inproceedings{ADFT03,
  Author = {Aguilera, M. K. and Delporte-Gallet, C. and Fauconnier, H. and Toueg, S.},
  Booktitle = {Proc. of the 22$^{nd}$ annual Symp. on Principles of Distributed Computing (PODC'03)},
  Date-Added = {2006-07-24 18:54:53 +0900},
  Date-Modified = {2006-10-25 15:08:56 +0900},
  Isbn = {1-58113-708-7},
  Keywords = {failure detection, omega},
  Language = {English},
  Location = {Boston, Massachusetts, USA},
  Pages = {306--314},
  Title = {On implementing {$\Omega$} with weak reliability and synchrony assumptions},
  Url = {http://doi.acm.org/10.1145/872035.872081},
  Year = {2003},
  Abstract = {We study the feasibility and cost of implementing {$\Omega$}---a fundamental failure detector at the core of many algorithms---in systems with weak reliability and synchrony assumptions. Intuitively, {$\Omega$} allows processes to eventually elect a common leader. We first give an algorithm that implements {$\Omega$} in a weak system S where processes are synchronous, but: (a) any number of them may crash, and (b) only the output links of an unknown correct process are eventually timely (all other links can be asynchronous and/or lossy). This is in contrast to previous implementations of {$\Omega$} which assume that a quadratic number of links are eventually timely, or systems that are strong enough to implement the eventually perfect failure detector {$\diamond$}P. 
We next show that implementing {$\Omega$} in S is expensive: even if we want an implementation that tolerates just one process crash, all correct processes (except possibly one) must send messages forever; moreover, a quadratic number of links must carry messages forever. We then show that with a small additional assumption---the existence of some unknown correct process whose asynchronous links are lossy but fair---we can implement {$\Omega$} efficiently: we give an algorithm for {$\Omega$} such that eventually only one process (the elected leader) sends messages.}}

@inproceedings{ADF01,
  Author = {Aguilera, M. K. and Delporte-Gallet, C. and Fauconnier, H.},
  Booktitle = {Proc. of the 15$^{th}$ Int. Symp. on Distributed Computing ({DISC'01})},
  Date-Added = {2006-07-19 13:46:15 +0900},
  Date-Modified = {2006-11-06 17:09:06 +0900},
  Isbn = {3-540-42605-1},
  Keywords = {leader election},
  Language = {English},
  Pages = {108--122},
  Publisher = {Springer-Verlag},
  Series = {Lecture Notes in Computer Science},
  Title = {Stable Leader Election},
  Url = {http://gatekeeper.dec.com/pub/DEC/SRC/publications/aguilera/leader.pdf},
  Volume = {2180},
  Year = {2001},
  Abstract = {We introduce the notion of stable leader election and derive several algorithms for this problem. Roughly speaking, a leader election algorithm is stable if it ensures that once a leader is elected, it remains the leader for as long as it does not crash and its links have been behaving well, irrespective of the behavior of other processes and links. In addition to being stable, our leader election algorithms have several desirable properties. In particular, they are all communication-efficient, i.e., they eventually use only n links to carry messages, and they are robust, i.e., they work in systems where only the links to/from some correct process are required to be eventually timely. 
Moreover, our best leader election algorithm tolerates message losses, and it ensures that a leader is elected in constant time when the system is stable. We conclude the paper by applying the above ideas to derive a robust and efficient algorithm for the eventually perfect failure detector {$\diamond$}P.}}

@techreport{RFC2741,
  Author = {Daniele, M. and Wijnen, B. and Ellison, M.},
  Date-Added = {2006-07-16 13:40:54 +0900},
  Date-Modified = {2006-07-16 13:42:19 +0900},
  Institution = {IETF},
  Keywords = {RFC, SNMP},
  Language = {English},
  Number = {2741},
  Title = {Agent Extensibility ({AgentX}) Protocol Version 1},
  Type = {RFC},
  Url = {ftp://ftp.isi.edu/in-notes/rfc2741.txt},
  Year = {2000}}

@manual{FT-CORBA1,
  Date-Added = {2006-07-16 11:45:15 +0900},
  Date-Modified = {2006-07-16 11:52:14 +0900},
  Keywords = {CORBA, fault-tolerance},
  Language = {English},
  Month = apr,
  Note = {ptc/2000-04-04},
  Organization = {Object Management Group},
  Title = {Fault Tolerant {CORBA} Specification, V1.0},
  Url = {http://grouppac.sourceforge.net/grouppac/00-04-04.pdf},
  Year = {2000}}

@inproceedings{GK02,
  Author = {Guerraoui, R. and Kouznetsov, P.},
  Bibdate = {2002-08-14},
  Booktitle = {Proc. of the Int. Conf. on Theoretical Computer Science ({TCS})},
  Date-Added = {2006-07-05 15:18:49 +0900},
  Date-Modified = {2006-07-14 18:08:56 +0900},
  Isbn = {1-4020-7181-7},
  Keywords = {atomic commitment, failure detection},
  Language = {English},
  Location = {Montr{\'e}al, Qu{\'e}bec, Canada},
  Month = aug,
  Pages = {461--473},
  Publisher = {Kluwer},
  Title = {On the Weakest Failure Detector for Non-Blocking Atomic Commit},
  Url = {http://lpdwww.epfl.ch/rachid/papers/TCS02.ps.gz},
  Volume = {223},
  Year = {2002},
  Abstract = {This paper addresses the question of the weakest failure detector to solve the Non-Blocking Atomic Commit (NBAC) problem in an asynchronous system. 
We define the set of timeless failure detectors which excludes failure detectors that provide information about global time but includes all known meaningful failure detectors such as $\diamond$S, $\diamond$P and [2]. We show that, within the weakest failure detector for NBAC is P + $\diamond$S.},
  Editor = {Baeza-Yates, R. A. and Montanari, U. and Santoro, N.}}

@inproceedings{GGL03,
  Location = {Bolton Landing, NY, USA},
  Author = {Ghemawat, S. and Gobioff, H. and Leung, S.},
  Booktitle = {Proceedings of the nineteenth Symposium on Operating Systems Principles (SOSP'03)},
  Date-Added = {2006-06-26 16:57:04 +0900},
  Date-Modified = {2006-06-26 17:06:52 +0900},
  Isbn = {1-58113-757-5},
  Keywords = {Google, filesystem},
  Language = {English},
  Month = oct,
  Organization = {ACM},
  Pages = {29--43},
  Publisher = {ACM Press},
  Title = {The {Google} File System},
  Url = {http://doi.acm.org/10.1145/945445.945450},
  Year = {2003},
  Abstract = {We have designed and implemented the Google File System, a scalable distributed file system for large distributed data-intensive applications. It provides fault tolerance while running on inexpensive commodity hardware, and it delivers high aggregate performance to a large number of clients. While sharing many of the same goals as previous distributed file systems, our design has been driven by observations of our application workloads and technological environment, both current and anticipated, that reflect a marked departure from some earlier file system assumptions. This has led us to reexamine traditional choices and explore radically different design points. The file system has successfully met our storage needs. It is widely deployed within Google as the storage platform for the generation and processing of data used by our service as well as research and development efforts that require large data sets. 
The largest cluster to date provides hundreds of terabytes of storage across thousands of disks on over a thousand machines, and it is concurrently accessed by hundreds of clients. In this paper, we present file system interface extensions designed to support distributed applications, discuss many aspects of our design, and report measurements from both micro-benchmarks and real world use.}}

@inproceedings{BP98,
  Location = {Brisbane, Australia},
  Author = {Brin, S. and Page, L.},
  Booktitle = {Proceedings of the seventh international conference on World Wide Web 7},
  Date-Added = {2006-06-26 16:50:41 +0900},
  Date-Modified = {2006-06-26 17:10:51 +0900},
  Issn = {0169-7552},
  Keywords = {Google, Internet Search},
  Language = {English},
  Pages = {107--117},
  Publisher = {Elsevier Science Publishers B. V.},
  Title = {The Anatomy of a Large-Scale Hypertextual Web Search Engine},
  Url = {http://www-db.stanford.edu/pub/papers/google.pdf},
  Year = {1998},
  Abstract = {In this paper, we present Google, a prototype of a large-scale search engine which makes heavy use of the structure present in hypertext. Google is designed to crawl and index the Web efficiently and produce much more satisfying search results than existing systems. The prototype with a full text and hyperlink database of at least 24 million pages is available at http://google.stanford.edu/. To engineer a search engine is a challenging task. Search engines index tens to hundreds of millions of web pages involving a comparable number of distinct terms. They answer tens of millions of queries every day. Despite the importance of large-scale search engines on the web, very little academic research has been done on them. Furthermore, due to rapid advance in technology and web proliferation, creating a web search engine today is very different from three years ago. This paper provides an in-depth description of our large-scale web search engine -- the first such detailed public description we know of to date. 
Apart from the problems of scaling traditional search techniques to data of this magnitude, there are new technical challenges involved with using the additional information present in hypertext to produce better search results. This paper addresses this question of how to build a practical large-scale system which can exploit the additional information present in hypertext. Also we look at the problem of how to effectively deal with uncontrolled hypertext collections where anyone can publish anything they want.}}

@inproceedings{GLB97a,
  Address = {Los Alamitos, CA, USA},
  Author = {Gemmell, Jim and Liebeherr, J{\"o}rg and Bassett, Dave},
  Booktitle = {Proceedings of Sixth International Conference on Computer Communications and Networks (ICCCN '97)},
  Date-Added = {2006-05-14 17:04:22 +0900},
  Date-Modified = {2006-06-30 13:19:45 +0900},
  Issn = {1095-2055},
  Keywords = {reliable broadcast},
  Language = {English},
  Pages = {60},
  Publisher = {IEEE Computer Society},
  Title = {An {API} for Scalable Reliable Multicast},
  Url = {http://doi.ieeecomputersociety.org/10.1109/ICCCN.1997.623291},
  Year = {1997}}

@article{EAWJ02,
  Address = {New York, NY, USA},
  Author = {Elnozahy, E. N. and Alvisi, L. and Wang, Y.-M. and Johnson, D. B.},
  Date-Added = {2006-05-14 16:48:24 +0900},
  Date-Modified = {2006-05-17 17:03:41 +0900},
  Issn = {0360-0300},
  Journal = {{ACM} Computing Surveys},
  Keywords = {message passing, message logging},
  Number = {3},
  Pages = {375--408},
  Publisher = {ACM Press},
  Title = {A survey of rollback-recovery protocols in message-passing systems},
  Url = {http://doi.acm.org/10.1145/568522.568525},
  Volume = {34},
  Year = {2002},
  Abstract = {This survey covers rollback-recovery techniques that do not require special language constructs. In the first part of the survey we classify rollback-recovery protocols into checkpoint-based and log-based. Checkpoint-based protocols rely solely on checkpointing for system state restoration. 
Checkpointing can be coordinated, uncoordinated, or communication-induced. Log-based protocols combine checkpointing with logging of nondeterministic events, encoded in tuples called determinants. Depending on how determinants are logged, log-based protocols can be pessimistic, optimistic, or causal. Throughout the survey, we highlight the research issues that are at the core of rollback-recovery and present the solutions that currently address them. We also compare the performance of different rollback-recovery protocols with respect to a series of desirable properties and discuss the issues that arise in the practical implementations of these protocols.}}

@inproceedings{JZ88,
  Author = {Johnson, D. B. and Zwaenepoel, W.},
  Booktitle = {Proc. of the 7$^{th}$ annual Symp. on Principles of distributed computing (PODC)},
  Date-Added = {2006-05-14 16:25:01 +0900},
  Date-Modified = {2006-08-23 15:07:50 +0900},
  Isbn = {0-89791-277-2},
  Keywords = {Checkpointing, message passing, message logging},
  Language = {English},
  Location = {Toronto, Ontario, Canada},
  Organization = {ACM},
  Pages = {171--181},
  Title = {Recovery in distributed systems using asynchronous message logging and checkpointing},
  Url = {http://doi.acm.org/10.1145/62546.62575},
  Year = {1988},
  Abstract = {In a distributed system using message logging and checkpointing to provide fault tolerance, there is always a unique maximum recoverable system state, regardless of the message logging protocol used. The proof of this relies on the observation that the set of system states that have occurred during any single execution of a system forms a lattice, with the sets of consistent and recoverable system states as sublattices. The maximum recoverable system state never decreases, and if all messages are eventually logged, the domino effect cannot occur. 
This paper presents a general model for reasoning about recovery in such a system and, based on this model, an efficient algorithm for determining the maximum recoverable system state at any time. This work unifies existing approaches to fault tolerance based on message logging and checkpointing, and improves on existing methods for optimistic recovery in distributed systems.}}

@inproceedings{JZ87,
  Author = {Johnson, D. B. and Zwaenepoel, W.},
  Booktitle = {The 17th annual international symposium on fault-tolerant computing},
  Date-Added = {2006-05-14 16:02:22 +0900},
  Date-Modified = {2006-05-17 17:03:35 +0900},
  Keywords = {Checkpointing, message passing},
  Language = {English},
  Month = jun,
  Organization = {IEEE Computer Society},
  Pages = {14--19},
  Title = {Sender-based message logging},
  Url = {http://infoscience.epfl.ch/getfile.py?docid=5577&name=ftcs87&format=ps.pdf&version=1},
  Year = {1987}}

@inproceedings{JJGV00,
  Author = {Johnson, S. and Jahanian, F. and Ghosh, S. and Vanvoorst, B. and Weininger, N.},
  Booktitle = {Proc. of the Int. Conf. on Dependable Systems and Networks (DSN)},
  Date-Added = {2006-05-10 17:19:08 +0900},
  Date-Modified = {2006-08-23 15:06:11 +0900},
  Isbn = {0-7695-0707-7},
  Keywords = {group communication, toolkit, distributed computing},
  Language = {English},
  Organization = {IEEE},
  Pages = {37--42},
  Title = {Experiences with Group Communication Middleware},
  Url = {http://www.eecs.umich.edu/~farnam/pubs/2000-jjg-dsn.pdf},
  Year = {2000},
  Abstract = {Group communication is a widely studied paradigm for building fault-tolerant distributed systems. The Armada project at the University of Michigan is a collaborative effort with the Honeywell Technology Center to study how real-world applications use group communication. 
In this paper, we describe the results of our experience implementing a fault-tolerant distributed radar tracking system, and discuss how we were able to simplify our design and implementation by utilizing additional services built on top of the group communication model.}} @inproceedings{MRT04, Author = {Mostefaoui, A. and Raynal, M. and Travers, C.}, Booktitle = {Proc. of the 23$^{rd}$ Int. Symp. on Reliable Distributed Systems (SRDS'04)}, Date-Added = {2006-05-02 12:22:27 +0900}, Date-Modified = {2006-11-06 17:07:37 +0900}, ISBN = {0-7695-2239-4}, Keywords = {leader election}, Month = {October}, Organization = {IEEE}, Pages = {208--217}, Title = {Crash-resilient time-free eventual leadership}, Url = {http://doi.ieeecomputersociety.org/10.1109/RELDIS.2004.1353022}, Year = {2004}, Abstract = {Leader-based protocols rest on a primitive able to provide the processes with the same unique leader. Such protocols are very common in distributed computing to solve synchronization or coordination problems. Unfortunately, providing such a primitive is far from being trivial in asynchronous distributed systems prone to process crashes. (It is even impossible in fault-prone purely asynchronous systems.) To circumvent this difficulty, several protocols have been proposed that build a leader facility on top of an asynchronous distributed system enriched with synchrony assumptions. This paper consider another approach to build a leader facility, namely, it considers a behavioral property on the flow of messages that are exchanged. This property has the noteworthy feature not to involve timing assumptions. Two protocols based on this time-free property that implement a leader primitive are described. The first one uses potentially unbounded counters, while the second one (which is a little more involved) requires only finite memory. 
These protocols rely on simple design principles that make them attractive, easy to understand and provably correct.}} @misc{SNMP4J, Date-Added = {2006-04-22 12:36:54 +0900}, Date-Modified = {2006-08-01 13:53:53 +0900}, Howpublished = {http://www.snmp4j.org/}, Key = {SNMP4J}, Keywords = {SNMP}, Title = {The {SNMP4J} project}, Url = {http://www.snmp4j.org/}} @misc{SNMP-FD, Date-Added = {2006-04-11 10:13:50 +0900}, Date-Modified = {2006-07-16 12:39:01 +0900}, Howpublished = {http://ddsg.jaist.ac.jp/en/projects/snmp-fd/}, Key = {SNMP-FD}, Keywords = {SNMP, service}, Title = {{SNMP-FD}}, Url = {http://ddsg.jaist.ac.jp/en/projects/snmp-fd/}} @inproceedings{HTC05, Author = {Horita, Y. and Taura, K. and Chikayama, T.}, Booktitle = {Proc. of the 6$^{th}$ Int. Workshop on Grid Computing ({GRID})}, Date-Added = {2006-04-04 18:40:12 +0900}, Date-Modified = {2006-07-15 11:19:44 +0900}, Keywords = {failure detection, grid}, Language = {English}, Location = {Seattle, Washington, USA}, Pages = {202--210}, Title = {A Scalable and Efficient Self-Organizing Failure Detector for Grid Applications}, Url = {http://www.logos.t.u-tokyo.ac.jp/~tau/papers/horita-grid2005-scalable.pdf}, Year = {2005}, Abstract = {Failure detection and group membership management are basic building blocks for self-repairing systems in distributed environments, which need to be scalable, reliable, and efficient in practice. Besides, now that a great number of available resources are becoming more widely distributed, it is more essential that they can be easily used with less manual configurations in Grid environments, where connectivity between different networks may be limited by firewalls and NATs. In this paper, we present a scalable failure detection protocol which self-organizes even in Grid environments. 
Our failure detector autonomously creates dispersed monitoring relations among participating processes so that any process would be monitored by a small number of other processes, and quickly disseminates notification along the monitoring relations if failures are detected. With simulations and real experiments, we showed that our failure detector has high scalability, high reliability, and high efficiency practically.}} @inproceedings{JS04, Author = {Jain, A. and Shyamasundar, R. K.}, Booktitle = {Proc. of the 5$^{th}$ Int. Workshop on Grid Computing ({GRID})}, Date-Added = {2006-04-04 18:36:37 +0900}, Date-Modified = {2006-07-15 12:01:26 +0900}, Keywords = {failure detection, grid, group membership, service}, Language = {English}, Pages = {44--52}, Title = {Failure Detection and Membership Management in Grid Environments}, Url = {http://doi.ieeecomputersociety.org/10.1109/GRID.2004.30}, Year = {2004}, Abstract = {Failure detectors are an integral part of any fault tolerant distributed system and hence have been a well-studied area. However, earlier proposed failure detectors fail to perform efficiently when applied to Grid environments. Most of the earlier proposed detectors were either designed for local area networks or to handle small number of nodes and hence lack in areas such as scalability, efficiency, running times etc. In this paper we propose a highly scalable failure detector protocol that is aided by a membership management service. The membership management service is essential to make the failure detector transparent to changes in the system. Using a distributed heartbeat mechanism, for an unreliable failure detector, we have overcome the shortcomings of similar schemes proposed earlier. It realizes scalability by reducing context switching requirements and achieves faster failure detection . The membership management protocol handles membership issues with a worst case complexity of O(n) where n is the number of heartbeat groups. 
Note that n is much smaller than the total number of nodes in the Grid. The algorithm is also shown to be failure resilient and scalable.}} @techreport{RFC2981, Author = {Kavasseri, R. and Stewart, B.}, Date-Added = {2006-03-28 11:01:34 +0900}, Date-Modified = {2006-03-28 11:04:22 +0900}, Institution = {Internet Engineering Task Force (IETF)}, Keywords = {SNMP, RFC}, Language = {English}, Number = {2981}, Title = {Event {MIB}}, Type = {RFC}, Url = {http://www.faqs.org/rfcs/rfc2981.html}, Year = {2000}} @inproceedings{Yon05, Address = {Cyprus}, Author = {Yoneki, E.}, Booktitle = {Proceedings of the International Workshop on Grid Computing and its Application to Data Analysis}, Date-Added = {2006-03-23 16:51:41 +0900}, Date-Modified = {2006-05-01 17:29:10 +0900}, Keywords = {middleware}, Language = {English}, Month = {November}, Organization = {IFIP}, Title = {Event Broker Grids with Filtering, Aggregation, and Correlation for Wireless Sensor Data}, Year = {2005}} @techreport{JAF05, Address = {Madrid, Spain}, Author = {Jim{\'e}nez, E. and Ar{\'e}valo, S. and Fern{\'a}ndez, A.}, Date-Added = {2006-03-16 17:55:41 +0900}, Date-Modified = {2006-10-25 15:34:44 +0900}, Institution = {Universidad Rey Juan Carlos}, Keywords = {failure detection, omega, leader election}, Language = {English}, Number = {RoSaC-2005-2}, Title = {Implementing the {$\Omega$} Failure Detector with Unknown Membership and Weak Synchrony}, Url = {http://gsyc.escet.urjc.es/publicaciones/tr/RoSaC-2005-2.pdf}, Year = {2005}} @techreport{RFC3878, Author = {Chisholm, S. and Romascanu, D.}, Date-Added = {2006-03-14 14:36:25 +0900}, Date-Modified = {2006-03-14 14:38:03 +0900}, Institution = {IETF}, Keywords = {SNMP}, Language = {English}, Number = {3877}, Title = {Alarm Management Information Base ({MIB})}, Type = {RFC}, Year = {2004}} @techreport{RFC3286, Author = {Ong, L. 
and Yoakum, J.}, Date-Added = {2006-03-08 15:32:39 +0900}, Date-Modified = {2006-03-08 15:35:36 +0900}, Institution = {IETF}, Keywords = {RFC, IP}, Language = {English}, Local-Url = {file://localhost/Users/wiesmann/Documents/Papers/00_others/rfc3286.pdf}, Number = {3286}, Title = {An introduction to the stream control transmission protocol ({SCTP})}, Type = {RFC}, Year = {2002}} @inproceedings{GLB97, Author = {Gemmell, J. and Liebeherr, J. and Bassett, D.}, Booktitle = {Proceedings of the 6$^{th}$ International Conference on Computer Communications and Networks (ICCCN '97)}, Date-Added = {2005-12-07 13:09:47 +0900}, Date-Modified = {2006-08-01 14:18:57 +0900}, Keywords = {broadcast}, Language = {English}, Organization = {IEEE}, Pages = {60--65}, Title = {An {API} for Scalable Reliable Multicast}, Url = {http://doi.ieeecomputersociety.org/10.1109/ICCCN.1997.623291}, Year = {1997}} @inproceedings{SM01, Author = {Sotoma, I. and Madeira, E. R. M.}, Booktitle = {Proc. of the 3rd Int. Symp. on Distributed Objects and Applications ({DOA})}, Date-Added = {2005-11-10 14:17:43 +0900}, Date-Modified = {2006-08-01 14:00:21 +0900}, Keywords = {failure detection, CORBA}, Language = {English}, Pages = {219--229}, Title = {{ADAPTATION} -- Algorithms to {ADAPTive} {FAulT} {MonItOriNg} and Their Implementation on {CORBA}}, Url = {http://doi.ieeecomputersociety.org/10.1109/DOA.2001.954087}, Year = {2001}, Abstract = {This paper presents ADAPTATION-Algorithms to ADAPTive FAulT MonItOriNg for asynchronous distributed systems and their implementation on CORBA. Our algorithms vary the timeouts based on a recent history of last elapsed times of the monitoring messages. The aim of the proposed algorithms is to provide a better response time to crashes and a minimum discrepancy between a suspection due to the network overload and due to the real process crash. 
The proposed approach extends the Fault Tolerant CORBA OMG specification with the push model and the definition of pull and push ADAPTION fault monitors. Some ADAPTION experiments on ACE+TAO were made to observe their behavior on changing network workloads.}} @inproceedings{GMR05, Author = {Gorender, S. and Mac{\^e}do, R. and Raynal, M.}, Booktitle = {Proc. of the Int. Conf. on Dependable Systems and Networks ({DSN})}, Date-Added = {2005-11-03 10:48:03 +0900}, Date-Modified = {2006-07-15 12:06:54 +0900}, Isbn = {0-7695-2282-3}, Keywords = {distributed systems, Synchrony}, Language = {English}, Location = {Yokohama, Japan}, Month = {June}, Pages = {412--421}, Title = {A hybrid and Adaptive Model for Fault-Tolerant Distributed Computing}, Url = {http://doi.ieeecomputersociety.org/10.1109/DSN.2005.8}, Year = {2005}, Abstract = {The capability of dynamically adapting to distinct run-time conditions is an important issue when designing distributed systems where negotiated quality of service (QoS) cannot always be delivered between processes. Providing fault-tolerance for such dynamic environments is a challenging task. Considering such a context, this paper proposes an adaptive model for fault-tolerant distributed computing. This model encompasses both the synchronous model (where there are time bounds on processing speed and message delay) and the asynchronous model (where there is no time bound). To illustrate what can be done in this model and how to use it, the consensus problem is taken as a benchmark problem. An implementation of the model is also described. This implementation relies on a negotiated quality of service (QoS) for channels, that can be timely or untimely. Moreover, the QoS of a channel can be lost during the execution (i.e., dynamically modified from timely to untimely), thereby adding uncertainty into the system.}} @article{SS83, Author = {Schlichting, R. D. and Schneider, F. 
B.}, Date-Added = {2005-10-07 15:48:05 +0900}, Date-Modified = {2006-05-17 16:10:43 +0900}, Journal = {{ACM} Transactions on Computer Systems}, Keywords = {fault-tolerance}, Language = {English}, Number = {3}, Pages = {222--238}, Title = {Fail-Stop Processors: An Approach to Designing Fault-Tolerant Computing Systems}, Url = {http://citeseer.ist.psu.edu/schlichting83failstop.html}, Volume = {1}, Year = {1983}, Abstract = {A methodology that facilitates the design of fault-tolerant computing systems is presented. It is based on the notion of a fail-stop processor. Such a processor automatically halts in response to any internal failure and does so before the effects of that failure become visible. The problem of implementing processors that, with high probability, behave like fail-stop processors is addressed. Axiomatic program verification techniques are described for use in developing provably correct programs for fail-stop processors. The design of a process control system illustrates the use of our methodology. }} @misc{NetSNMP, Date-Added = {2005-10-05 11:06:37 +0900}, Date-Modified = {2005-10-05 11:13:56 +0900}, Howpublished = {http://net-snmp.sourceforge.net/}, Key = {NSNMP}, Keywords = {SNMP}, Language = {English}, Title = {The Net-{SNMP} Project}, Url = {http://net-snmp.sourceforge.net/}} @inproceedings{JSST05, Address = {Yokohama, Japan}, Author = {Janakiraman, G. J. and Santos, J. R. and Subhraveti, D. and Turner, Y.}, Booktitle = {Proceedings of the International Conference on Dependable Systems and Networks (DSN 2005)}, Date-Added = {2005-09-30 15:12:28 +0900}, Date-Modified = {2006-05-17 16:09:42 +0900}, Isbn = {0-7695-2282-3}, Keywords = {Checkpointing}, Language = {English}, Month = {June}, Organization = {IEEE}, Pages = {260--269}, Title = {Cruz: Application-Transparent Distributed Checkpoint-Restart on Standard Operating Systems}, Url = {http://doi.ieeecomputersociety.org/10.1109/DSN.2005.33}, Year = {2005}} @inproceedings{KPS+04, Author = {Kup{\v s}ys, A. 
and Pleisch, S. and Schiper, A. and Wiesmann, M.}, Booktitle = {Proceedings of the 3rd International Symposium on Network Computing and Applications (IEEE NCA04)}, Date-Added = {2005-09-30 14:54:41 +0900}, Date-Modified = {2006-08-01 12:07:45 +0900}, Keywords = {JMS, MOM, Java}, Language = {English}, Location = {Cambridge, MA, USA}, Organization = {IEEE}, Title = {Towards {JMS} compliant group communication -- a semantic mapping}, Url = {http://doi.ieeecomputersociety.org/10.1109/NCA.2004.1347770}, Year = {2004}, Abstract = {Group communication provides communication primitives with various semantics and their use greatly simplifies the development of highly available services. However, despite tremendous advances in research and numerous prototypes, group communication stays confined to small niches and academic prototypes. In contrast, message-oriented middleware such as the Java Message Service (JMS) is widely used, and has become a de-facto standard. We believe that the lack of a well-defined and easily understandable standard is the reason that hinders the deployment of group communication systems. Since JMS is a well-established technology, an interesting solution is to extend JMS adding group communication primitives to it. Foremost, this requires to extend the traditional semantics of group communication in order to take into account various features of JMS, e.g., durable/non-durable subscriptions and persistent/non-persistent messages. The resulting new group communication specification, together with the corresponding API, defines group communication primitives compatible with JMS. As such, it facilitates the acceptance of group communication by a larger community and provides a powerful environment for building fault-tolerant applications.}} @inproceedings{OGP03, Address = {Seattle, WA, USA}, Author = {Oppenheimer, D. and Ganapathi, A. and Patterson, D. 
A.}, Booktitle = {Proceedings of the 4th Symposium on Internet Technologies and Systems (USITS '03)}, Date-Added = {2005-09-28 15:59:18 +0900}, Date-Modified = {2005-11-02 15:26:50 +0900}, Keywords = {fault-tolerance, Fault statistics}, Language = {English}, Month = {March}, Organization = {USENIX}, Pages = {1--16}, Title = {Why do {Internet} services fail, and what can be done about it?}, Url = {http://www.stanford.edu/~candea/teaching/cs444a-fall-2003/readings/oppenheimer-fail.pdf}, Year = {2003}, Abstract = {In 1986 Jim Gray published his landmark study of the causes of failures of Tandem systems and the techniques Tandem used to prevent such failures See J. Gray. Why do computers stop and what can be done about it? Symposium on Reliability in Distributed Software and Database Systems, 1986... Seventeen years later, Internet services have replaced fault-tolerant servers as the new kid on the 24x7-availability block. Using data from three large-scale Internet services, we analyzed the causes of their failures and the (potential) effectiveness of various techniques for preventing and mitigating service failure. We find that (1) operator error is the largest single cause of failures in two of the three services, (2) operator errors often take a long time to repair, (3) configuration errors are the largest category of operator errors, (4) failures in custom-written front-end software are significant, and (5) more extensive online testing and more thoroughly exposing and detecting component failures would reduce failure rates in at least one service. Qualitatively we find that improvement in the maintenance tools and systems used by service operations staff would decrease time to diagnose and repair problems.}} @inproceedings{WNSS02, Address = {Grenoble, France}, Author = {Wansbrough, K. and Norrish, M. and Sewell, P. 
and Serjantov, A.}, Booktitle = {Proceedings of the 11th European Symposium on Programming, Programming Languages and Systems (ESOP 2002)}, Date-Added = {2005-09-26 16:17:45 +0900}, Date-Modified = {2006-03-08 15:31:50 +0900}, Editor = {Le M{\'e}tayer, D.}, Issn = {0302-9743}, Keywords = {IP, ICMP, semantics, failure detection}, Language = {English}, Month = {April}, Pages = {278--294}, Publisher = {Springer-Verlag GmbH}, Series = {Lecture Notes in Computer Science}, Title = {Timing {UDP}: Mechanized Semantics for Sockets, Threads, and Failures}, Url = {http://www.springerlink.com/(fns0kb55vt14zu45inlgjzql)/app/home/contribution.asp?referrer=parent&backto=issue,20,22;journal,1339,2200;linkingpublicationresults,1:105633,1}, Volume = {2305}, Year = {2002}, Abstract = {This paper studies the semantics of failure in distributed programming. We present a semantic model for distributed programs that use the standard sockets interface; it covers message loss, host failure and temporary disconnection, and supports reasoning about distributed infrastructure. We consider interaction via the UDP and ICMP protocols. To do this, it has been necessary to: * construct an experimentally-validated post-hoc specification of the UDP/ICMP sockets interface; * develop a timed operational semantics with threads, as such programs are typically multithreaded and depend on timeouts; * model the behaviour of partial systems, making explicit the interactions that the infrastructure offers to applications; * integrate the above with semantics for an executable fragment of a programming language (OCaml) with OS library primitives; and * use tool support to manage complexity, mechanizing the model with the HOL theorem prover. We illustrate the whole with a module providing na{\"\i}ve heartbeat failure detection.}} @article{AT99, Author = {Aguilera, M. K. 
and Toueg, S.}, Date-Added = {2005-09-20 11:06:46 +0900}, Date-Modified = {2005-11-02 15:20:27 +0900}, Issn = {0097-5397}, Journal = {{SIAM} Journal on Computing}, Keywords = {failure detection, consensus}, Language = {English}, Month = {February}, Number = {3}, Pages = {890--903}, Title = {Failure Detection and Randomization: A Hybrid Approach to Solve Consensus}, Url = {http://www.hpl.hp.com/personal/Marcos_Aguilera/papers/hybrid-siam1998.pdf}, Volume = {28}, Year = {1999}, Abstract = {We present a consensus algorithm that combines unreliable failure detection and randomization, two well-known techniques for solving consensus in asynchronous systems with crash failures. This hybrid algorithm combines advantages from both approaches: it guarantees deterministic termination if the failure detector is accurate, and probabilistic termination otherwise. In executions with no failures or failure detector mistakes, the most likely ones in practice, consensus is reached in only two asynchronous rounds.}} @article{Raynal05, Author = {Raynal, M.}, Date-Added = {2005-09-20 10:46:44 +0900}, Date-Modified = {2005-11-02 15:24:18 +0900}, Journal = {ACM SIGACT News}, Keywords = {failure detection}, Language = {English}, Month = {March}, Number = {1}, Pages = {53--70}, Title = {A short introduction to failure detectors for Asynchronous Distributed Systems}, Url = {http://doi.acm.org/10.1145/1052796.1052806}, Volume = {36}, Year = {2005}, Abstract = {Since the first version of Chandra and Toueg's seminal paper titled ``Unreliable failure detectors for reliable distributed systems'' in 1991, the failure detector concept has been extensively studied and investigated. This is not at all surprising as failure detection is pervasive in the design, the analysis and the implementation of a lot of fault-tolerant distributed algorithms that constitute the core of distributed system middleware. 
The literature on this topic is mostly technical and appears mainly in theoretically inclined journals and conferences. The aim of this paper is to offer an introductory survey to the failure detector concept for readers who are not familiar with it and want to quickly understand its aim, its basic principles, its power and limitations. To attain this goal, the paper first describes the motivations that underlie the concept, and then surveys several distributed computing problems showing how they can be solved with the help of an appropriate failure detector. So, this short paper presents motivations, concepts, problems, definitions, and algorithms. It does not contain proofs. It is aimed at people who want to understand basics of failure detectors. }} @misc{X.690, Date-Added = {2005-09-19 16:40:26 +0900}, Date-Modified = {2005-10-07 16:14:21 +0900}, Howpublished = {ITU-T Recommendation X.690}, Key = {X.690}, Keywords = {ASN.1, BER}, Language = {English}, Number = {X.690}, Organization = {ITU-T Information technology}, Title = {{ASN.1} encoding rules: Specification of Basic Encoding Rules ({BER}), Canonical Encoding Rules ({CER}) and Distinguished Encoding Rules ({DER})}, Type = {Recommendation}, Url = {http://www.itu.int/ITU-T/studygroups/com17/languages/X.690-0207.pdf}, Year = {2002}, Abstract = {This Recommendation | International Standard defines a set of Basic Encoding Rules (BER) that may be applied to values of types defined using the ASN.1 notation. Application of these encoding rules produces a transfer syntax for such values. It is implicit in the specification of these encoding rules that they are also used for decoding. This Recommendation | International Standard defines also a set of Distinguished Encoding Rules (DER) and a set of Canonical Encoding Rules (CER) both of which provide constraints on the Basic Encoding Rules (BER). 
The key difference between them is that DER uses the definite length form of encoding while CER uses the indefinite length form. DER is more suitable for the small encoded values, while CER is more suitable for the large ones. It is implicit in the specification of these encoding rules that they are also used for decoding. }} @inproceedings{MPR04, Author = {Most{\'e}faoui, A. and Powell, D. and Raynal, M.}, Booktitle = {Proc. of the 10$^{th}$ Pacific Rim Int. Symp. on Dependable Computing (PRDC)}, Date-Added = {2005-09-19 15:15:01 +0900}, Date-Modified = {2006-07-15 11:04:43 +0900}, Keywords = {failure detection}, Language = {English}, Location = {Papeete, Tahiti}, Month = {March}, Pages = {57--65}, Title = {A Hybrid Approach for Building Eventually Accurate Failure Detectors}, Url = {http://doi.ieeecomputersociety.org/10.1109/PRDC.2004.1276553}, Year = {2004}, Abstract = {Unreliable failure detectors introduced by Chandra and Toueg are abstract mechanisms that provide information about process crashes. On the one hand, failure detectors allow a statement of the minimal requirements on process failures that allow solutions to problems that cannot otherwise be solved in purely asynchronous systems. However, on the other hand, they cannot be implemented in such systems: their implementation requires that the underlying distributed system be enriched with additional assumptions. Classic failure detector implementations rely on additional synchrony assumptions such as partial synchrony. More recently, a new approach for implementing failure detectors has been proposed: it relies on behavioral properties on the flow of messages exchanged. This paper shows that these approaches are not antagonistic and can be advantageously combined. A hybrid protocol (the first to our knowledge) implementing failure detectors with eventual accuracy properties is presented. 
Interestingly, this protocol benefits from the best of both worlds in the sense that it converges (i.e., provides the required failure detector) as soon as either the system behaves synchronously or the required message exchange pattern is satisfied. This shows that, to expedite convergence, it can be interesting to consider that the underlying system can satisfy several alternative assumptions.}} @inproceedings{MMR03, Author = {Most{\'e}faoui, A. and Mourgaya, E. and Raynal, M.}, Booktitle = {Proc. of the Int. Conf. on Dependable Systems and Networks (DSN)}, Date-Added = {2005-09-19 15:05:17 +0900}, Date-Modified = {2006-07-15 11:05:13 +0900}, Isbn = {0-7695-1959-8}, Keywords = {failure detection}, Language = {English}, Month = {June}, Pages = {351--360}, Title = {Asynchronous implementation of failure detectors}, Url = {http://doi.ieeecomputersociety.org/10.1109/DSN.2003.1209946}, Year = {2003}, Abstract = {Unreliable failure detectors introduced by Chandra and Toueg are abstract mechanisms that provide information on process failures. On the one hand, failure detectors allow to state the minimal requirements on process failures that allow to solve problems that cannot be solved in purely asynchronous systems. But, on the other hand, they cannot be implemented in such systems: their implementation requires that the underlying distributed system be enriched with additional assumptions. The usual failure detector implementations rely on additional synchrony assumptions (e.g., partial synchrony). This paper proposes a new look at the implementation of failure detectors and more specifically at Chandra-Toueg's failure detectors. The proposed approach does not rely on synchrony assumptions (e.g., it allows the communication delays to always increase). 
It is based on a query-response mechanism and assumes that the query/response messages exchanged obey a pattern where the responses from some processes to a query arrive among the (n - f) first ones (n being the total number of processes, f the maximum number of them that can crash, with $1 \leq f < n$). When we consider the particular case f = 1, and the implementation of a failure detector of the class denoted S (the weakest class that allows to solve the consensus problem), the additional assumption the underlying system has to satisfy boils down to a simple channel property, namely, there is eventually a pair of processes (pi, pj) such that the channel connecting them is never the slowest among the channels connecting pi or pj to the other processes. A probabilistic analysis shows that this requirement is practically met in asynchronous distributed systems. }} @inproceedings{BCG+97, Author = {Bondavalli, A. and Chiaradonna, S. and Di Giandomenico, F. and Grandoni, F.}, Booktitle = {Proc. of the 27$^{th}$ Int. Symp. on Fault-Tolerant Computing ({FTCS})}, Date-Added = {2005-09-19 14:50:19 +0900}, Date-Modified = {2006-07-15 11:01:52 +0900}, Keywords = {reliability}, Pages = {354--362}, Title = {Discriminating Fault Rate and Persistency to Improve Fault Treatment}, Url = {http://doi.ieeecomputersociety.org/10.1109/FTCS.1997.614109}, Year = {1997}, Abstract = {In this paper the consolidate identification of faults, distinguished as transient or permanent/intermittent, is approached. Transient faults discrimination has long been performed in commercial systems: threshold-based techniques have been practiced for several years for this purpose. The present work aims to contribute to the usefulness of the count-and-threshold scheme, through the analysis of its behavior and the exploration of its effects on the system. To this goal, the scheme is mechanized as a device named a-count, endowed with a few controllable parameters. 
a-count tries to balance between two conflicting requirements: to keep in the system those components that have experienced just transient faults; and to remove quickly those affected by permanent or intermittent faults. Analytical models are derived, allowing detailed study of a-count's behaviour; the actual evaluation, in a range of configurations, is performed by standard tools, in terms of the delay in spotting faulty components and the probability of improperly blaming correct ones.}} @inproceedings{CRV95, Author = {Cosquer, F. J. N. and Rodrigues, L. and Ver{\'\i}ssimo, P.}, Booktitle = {Proc. of the 7$^{th}$ Int. Conf. on Parallel and Distributed Computing and Systems}, Date-Added = {2005-09-19 14:38:53 +0900}, Date-Modified = {2006-07-15 11:15:41 +0900}, Keywords = {failure detection}, Language = {English}, Location = {Washington D.C, USA}, Month = {October}, Title = {Using Tailored Failure Suspectors to Support Distributed Cooperative Applications}, Url = {http://www.navigators.di.fc.ul.pt/docs/abstracts/tailoredfs.html}, Year = {1995}, Abstract = {This paper presents an approach to effectively support cooperative applications using tailored failure suspectors. Using a group communication subsystem, it is shown how failure suspectors can be configured to model the requirements/semantics of cooperative applications thus avoiding ad-hoc system decisions. This approach is highly relevant in the context of large scale distributed systems like the Internet, where communication high variance and unpredictable delays increase the probability of incorrect failure detection. Applications are presented illustrating how failure suspectors are configured and possibly combined with new feedback techniques in order to implement more powerful cooperative environments.}} @inproceedings{DHJK04, Author = {Dunagan, J. and Harvey, N. J. A. and Jones, M. B. and Kosti{\'c}, D. and Theimer, M. and Wolman, A.}, Booktitle = {Proc. of the 6$^{th}$ Symp. 
on Operating Systems Design and Implementation (OSDI)}, Date-Added = {2005-09-19 14:17:53 +0900}, Date-Modified = {2006-07-15 11:26:34 +0900}, Keywords = {failure detection, service}, Language = {English}, Pages = {151--166}, Title = {{FUSE}: Lightweight guaranteed distributed failure notification}, Url = {http://research.microsoft.com/research/sn/Herald/papers/FUSE.pdf}, Year = {2004}, Abstract = {FUSE is a lightweight failure notification service for building distributed systems. Distributed systems built with FUSE are guaranteed that failure notifications never fail. Whenever a failure notification is triggered, all live members of the FUSE group will hear a notification within a bounded period of time, irrespective of node or communication failures. In contrast to previous work on failure detection, the responsibility for deciding that a failure has occurred is shared between the FUSE service and the distributed application. This allows applications to implement their own definitions of failure. Our experience building a scalable distributed event delivery system on an overlay network has convinced us of the usefulness of this service. Our results demonstrate that the network costs of each FUSE group can be small; in particular, our overlay network implementation requires no additional liveness-verifying ping traffic beyond that already needed to maintain the overlay, making the steady state network load independent of the number of active FUSE groups.}} @inproceedings{MRR05, Address = {Las Vegas, NV, USA}, Author = {Mostefaoui, A. and Rajsbaum, S. 
and Raynal, M.},
	Booktitle = {Proceedings of the twenty-fourth annual symposium on Principles of distributed computing},
	Date-Added = {2005-09-19 11:18:48 +0900},
	Date-Modified = {2005-09-19 11:30:00 +0900},
	Isbn = {1-58113-994-2},
	Language = {English},
	Organization = {ACM SIGACT-SIGOPS},
	Pages = {179--188},
	Publisher = {ACM Press},
	Title = {The combined power of conditions and failure detectors to solve asynchronous set agreement},
	Url = {http://doi.acm.org/10.1145/1073814.1073848},
	Year = {2005},
	Abstract = {An approach to cope with the impossibility of solving agreement problems in asynchronous systems made up of n processes and prone to t process crashes is to use failure detectors. An orthogonal approach that has been used is to consider conditions that restrict the possible inputs to such a problem. This paper considers a system with both failure detectors and conditions. The aim is to identify the failure detector class that abstracts away the synchrony needed to solve k-set agreement for a given condition. Three main contributions are presented. The first is a new class of failure detectors denoted $\phi_t^y$, $0 \le y \le t$. The processes can invoke a primitive query$_y$(S) with a set of process ids S. Roughly speaking, query$_y$(S) returns true only when all processes in S have crashed, provided $t-y < |S| \le t$. It is shown that the classic Chandra and Toueg's failure detectors are incomparable to the $\phi_t^y$ failure detectors. The second contribution is a generic condition-based protocol for $\phi_t^y$ that solves k-set agreement. It can be instantiated with any (t-d)-legal condition C and solves k-set agreement for k=1+max(0,d-y); termination is guaranteed for inputs in C. (A condition is x-legal if and only if it can be used to solve x-fault tolerant asynchronous consensus.) A variant of the protocol that terminates always is described. Finally, a corresponding lower bound is presented showing that there is no $\phi_t^y$-based k-set agreement protocol for (t-d)-legal conditions with $k \le \max(0,d-y)$.}}

%% Voll01: numeric month replaced by the month name used throughout this file.
@misc{Voll01,
	Author = {Vollbrecht, R.},
	Date-Added = {2005-09-08 17:44:13 +0900},
	Date-Modified = {2005-09-08 17:46:42 +0900},
	Howpublished = {White paper},
	Keywords = {IPMI, SNMP},
	Language = {English},
	Month = {December},
	Organization = {Intel},
	Title = {The Telecom System View},
	Url = {http://www.intel.com/design/cgservers/downloads/1_4_Telco_Alarm_Manager.pdf},
	Year = {2001}}

%% JMX99: no author; the Key field supplies the label and sort key.
@misc{JMX99,
	Address = {901 San Antonio Road, Palo Alto, CA 94303 USA},
	Date-Added = {2005-09-08 17:39:17 +0900},
	Date-Modified = {2005-10-05 11:10:57 +0900},
	Howpublished = {White Paper},
	Key = {JMX99},
	Keywords = {Java, SNMP},
	Language = {English},
	Month = {June},
	Organization = {Sun Microsystems},
	Title = {Java Management Extensions},
	Type = {White paper},
	Url = {http://java.sun.com/products/JavaManagement/wp/JMXwhitepaper.pdf},
	Year = {1999}}

@techreport{RFC1757,
	Author = {Waldbusser, S.},
	Date-Added = {2005-09-08 17:26:26 +0900},
	Date-Modified = {2005-09-08 17:27:59 +0900},
	Institution = {IETF},
	Language = {English},
	Number = {1757},
	Title = {Remote Network Monitoring Management Information Base},
	Type = {RFC},
	Url = {http://www.faqs.org/rfcs/rfc1757.html},
	Year = {1995}}

%% RFC1697: author spellings and title corrected; NOTE(review): confirm against the RFC header.
@techreport{RFC1697,
	Author = {Brower, D. and Purvy, R. and Daniel, A. and Sinykin, M. and Smith, J.},
	Date-Added = {2005-09-08 17:17:06 +0900},
	Date-Modified = {2006-07-14 17:53:55 +0900},
	Institution = {IETF},
	Keywords = {SNMP, Database, RFC},
	Language = {English},
	Number = {1697},
	Title = {Relational Database Management System ({RDBMS}) Management Information Base ({MIB}) using {SMIv2}},
	Type = {RFC},
	Url = {http://www.faqs.org/rfcs/rfc1697.html},
	Year = {1994}}

@inproceedings{BFBW01,
	Address = {Nalecz{\'o}w},
	Author = {Bubak, M. and Funika, W. and Balis, B.
and Wism{\"u}ller, R.},
	Booktitle = {Proceedings of the fourth International Conference on Parallel Processing and Applied Mathematics ({PPAM} 2001)},
	Date-Added = {2005-09-08 15:48:54 +0900},
	Date-Modified = {2006-03-08 19:28:38 +0900},
	Issn = {0302-9743},
	Keywords = {grid},
	Language = {English},
	Month = {September},
	Pages = {307--315},
	Title = {A Concept of Grid Application Monitoring},
	Url = {http://www.springerlink.com/app/home/contribution.asp?wasp=1434aabde2be4ae082ceaa1b25e54778&referrer=parent&backto=issue,34,101;journal,1285,2169;linkingpublicationresults,1:105633,1},
	Year = {2001}}

@inproceedings{BS90,
	Address = {Cairo, Egypt},
	Author = {Bondavalli, A. and Simoncini, L.},
	Booktitle = {Second Workshop on Future Trends of Distributed Computing Systems},
	Date-Added = {2005-09-05 15:28:41 +0900},
	Date-Modified = {2005-11-02 15:21:57 +0900},
	Isbn = {0-8186-2088-9},
	Keywords = {failure detection},
	Language = {English},
	Month = {September},
	Organization = {IEEE},
	Pages = {47--53},
	Title = {Failure classification with respect to detection},
	Url = {http://ieeexplore.ieee.org/xpl/abs_free.jsp?arNumber=138293},
	Year = {1990}}

@article{JMNN98,
	Author = {Duarte, E. P. and Mansfield, G. and Nanya, T. and Noguchi, S.},
	Date-Added = {2005-08-26 16:20:00 +0900},
	Date-Modified = {2006-07-15 11:28:35 +0900},
	Issn = {1099-1190},
	Journal = {Int.
Journal of Network Management},
	Keywords = {SNMP, Dependability},
	Language = {English},
	Month = {July},
	Number = {4},
	Pages = {244--253},
	Title = {Improving the Dependability of Network Management Systems},
	Url = {http://portal.acm.org/citation.cfm?id=336783},
	Volume = {8},
	Year = {1998}}

@techreport{RFC792,
	Author = {Postel, J.},
	Date-Added = {2005-08-26 13:53:23 +0900},
	Date-Modified = {2005-08-26 13:54:32 +0900},
	Institution = {IETF},
	Keywords = {ICMP},
	Language = {English},
	Number = {792},
	Title = {Internet Control Message Protocol},
	Type = {RFC},
	Url = {http://www.faqs.org/rfcs/rfc792.html},
	Year = {1981}}

@article{SDFK99,
	Author = {Stelling, P. and DeMatteis, C. and Foster, I. and Kesselman, C. and Lee, C. and von Laszewski, G.},
	Date-Added = {2005-08-25 17:19:40 +0900},
	Date-Modified = {2005-11-02 15:24:34 +0900},
	Journal = {Cluster Computing},
	Keywords = {failure detection, Grid},
	Language = {English},
	Month = {June},
	Number = {2},
	Pages = {117--128},
	Title = {A fault detection service for wide area distributed computations},
	Url = {http://www-unix.globus.org/ftppub/globus/papers/hbm.pdf},
	Volume = {2},
	Year = {1999},
	Abstract = {The potential for faults in distributed computing systems is a significant complicating factor for application developers. While a variety of techniques exist for detecting and correcting faults, the implementation of these techniques in a particular context can be difficult. Hence, we propose a fault detection service designed to be incorporated, in a modular fashion, into distributed computing systems, tools, or applications. This service uses well-known techniques based on unreliable fault detectors to detect and report component failure, while allowing the user to trade off timeliness of reporting against false positive rates.
We describe the architecture of this service, report on experimental results that quantify its cost and accuracy, and describe its use in two applications, monitoring the status of system components of the GUSTO computational grid testbed and as part of the NetSolve network-enabled numerical solver.}}

@inproceedings{WSGY00,
	Author = {Waheed, A. and Smith, W. and George, J. and Yan, J.},
	Booktitle = {Proc. of the 5$^{th}$ Int. Workshop on Languages, Compilers, and Run-Time Systems for Scalable Computers ({LCR})},
	Date-Added = {2005-08-25 17:05:27 +0900},
	Date-Modified = {2006-07-15 12:05:02 +0900},
	Keywords = {Grid, Monitoring, service},
	Language = {English},
	Location = {Rochester, NY, USA},
	Month = {May},
	Pages = {235},
	Title = {An Infrastructure for Monitoring and Management in Computational Grids},
	Url = {http://www.springerlink.com/app/home/contribution.asp?wasp=f0e270d3e22c4b70bf3cb5b5bc4eea63&referrer=parent&backto=issue,18,22;journal,1673,2146;linkingpublicationresults,1:105633,1},
	Year = {2000},
	Abstract = {We present the design and implementation of an infrastructure that enables monitoring of resources, services, and applications in a computational grid and provides a toolkit to help manage these entities when faults occur. This infrastructure builds on three basic monitoring components: sensors to perform measurements, actuators to perform actions, and an event service to communicate events between remote processes. We describe how we apply our infrastructure to support a grid service and an application: (1) the Globus Metacomputing Directory Service; and (2) a long-running and coarse-grained parameter study application. We use these application to show that our monitoring infrastructure is highly modular, conveniently retargettable, and extensible.}}

@techreport{TAG+02,
	Author = {Tierney, B. and Aydt, R. and Gunter, D. and Smith, W. and Swany, M. and Taylor, V.
and Wolski, R.},
	Date-Added = {2005-08-25 16:29:24 +0900},
	Date-Modified = {2006-05-17 16:41:16 +0900},
	Institution = {Global Grid Forum / Grid Monitoring Architecture Working Group},
	Keywords = {grid, monitoring},
	Language = {English},
	Title = {A Grid Monitoring Architecture},
	Type = {Memo},
	Url = {http://www-didc.lbl.gov/GGF-PERF/GMA-WG/papers/GWD-GP-16-2.pdf},
	Year = {2002},
	Abstract = {Large distributed systems such as Computational and Data Grids require that a substantial amount of monitoring data be collected for various tasks such as fault detection, performance analysis, performance tuning, performance prediction, and scheduling. Some tools are currently available and others are being developed for collecting and forwarding this data. The goal of this paper is to describe the major components of a Grid monitoring architecture and their essential interactions. By adopting standard terminology and describing the minimal specification to support required functionality, we hope to encourage the development of interoperable high-quality performance tools for the Grid. To motivate the Grid Monitoring Architecture (GMA) design and to guide implementation, we also present the characteristics that are critical to proper functioning of a performance monitoring system for the Grid.}}

@article{MZH99,
	Author = {Martin-Flatin, J.-P. and Znaty, S.
and Hubaux, J.-P.},
	Date-Added = {2005-08-25 16:13:01 +0900},
	Date-Modified = {2006-05-17 16:48:05 +0900},
	Doi = {10.1023/A:1018761615354},
	Issn = {1573-7705},
	Journal = {Journal of Network and Systems Management},
	Keywords = {distributed systems, management},
	Language = {English},
	Month = {March},
	Number = {1},
	Pages = {9--26},
	Title = {A Survey of Distributed Enterprise Network and Systems Management Paradigms},
	Url = {http://icawww.epfl.ch/Publications/Martin-Flatin/Martin-FlatinZH99.pdf},
	Volume = {7},
	Year = {1999},
	Abstract = {Since the mid 1990s, network and systems management has steadily evolved from centralized paradigms, where the management application runs on a single management station, to distributed paradigms, where it is distributed over many nodes. In this survey, our goal is to classify all these paradigms, especially the new ones, in order to help network and systems administrators design a management application, and choose between mobile code, distributed objects, intelligent agents, etc. Step by step, we build an enhanced taxonomy based on four criteria: the delegation granularity, the semantic richness of the information model, the degree of specification of a task, and the degree of automation of management.}}

@inproceedings{SAF00,
	Author = {Subramanyan, S. and Alonso, J. M. and Fortes, J. A. B.},
	Booktitle = {Proc. of Supercomputing},
	Date-Added = {2005-08-25 15:57:14 +0900},
	Date-Modified = {2006-07-15 11:23:40 +0900},
	Isbn = {0-7803-9802-5},
	Keywords = {SNMP, monitoring, scalability},
	Language = {English},
	Location = {Dallas, Texas, USA},
	Title = {A scalable {SNMP}-based distributed monitoring system for heterogeneous network computing},
	Url = {http://www.supercomp.org/sc2000/Proceedings/techpapr/papers/pap280.pdf},
	Year = {2000},
	Abstract = {Traditional centralized monitoring systems do not scale to present-day large, complex, network-computing systems.
Based on recent SNMP standards for distributed management, this paper addresses the scalability problem through distribution of monitoring tasks, applicable for tools such as SIMONE (SNMP-based monitoring prototype implemented by the authors). Distribution is achieved by introducing one or more levels of a dual entity called the Intermediate Level Manager (ILM) between a manager and the agents. The ILM accepts monitoring tasks described in the form of scripts and delegated by the next higher entity. The solution is flexible and integratable into a SNMP tool without altering other system components. A testbed of up to 1024 monitoring elements is used to assess scalability. Noticeable improvements in the round trip delay (from seconds to less than tenth of a second) were observed when more than 200 monitoring elements are present and as few as 2 ILM's are used.}}

@inproceedings{STD02,
	Author = {Su, M. S. and Thulasiraman, K. and Das, A.},
	Booktitle = {Proc. of the Global Telecom. Conf. ({GLOBECOM})},
	Date-Added = {2005-08-25 15:47:51 +0900},
	Date-Modified = {2006-07-15 11:22:47 +0900},
	Isbn = {0-7803-7632-3},
	Keywords = {Fault diagnosis, SNMP},
	Language = {English},
	Month = {November},
	Pages = {1960--1964},
	Title = {A Scalable On-Line Multilevel Distributed Network Fault Detection/Monitoring System Based on the {SNMP} Protocol},
	Url = {http://www.sosu.edu/faculty/msu/gen-11-2.pdf},
	Volume = {2},
	Year = {2002},
	Abstract = {Traditional centralized network management solutions do not scale to present-day large-scale computer/communication networks. Decentralization/distributed solutions can solve some of these problems (Goldszmidt, G. and Yemini, Y., 1995), and thus there is considerable interest in distributed/decentralized network management applications. We present the design and evaluation of an SNMP-based distributed network fault detection/monitoring system. We integrate into the SNMP framework our ML-ADSD algorithm (Su, M.-S. et al., Proc.
39th Annual Allerton Conf. on Commun., Control, and Computers, 2001; Su, Multilevel distributed diagnosis and the design of a distributed network fault detection system based on the SNMP protocol, Ph.D. Thesis, School of Computer Science, University of Oklahoma, 2002) for fault diagnosis in a distributed processor system. The algorithm uses the multilevel paradigm and requires only minor modifications to be scalable to networks of varying sizes. The system is fault tolerant, allowing processor failure and/or recovery during the diagnosis process. We have implemented the system on an Ethernet network of 32 machines. Our results show that the diagnosis latency (or time to termination) is much better than that of earlier solutions. Also, the system's bandwidth utilization is insignificant, demonstrating the practicality of its deployment in a real network. We have successfully integrated three modern disciplines: network management, distributed computing and system level diagnosis.}}

@inproceedings{SIK02,
	Author = {Shinbo, H. and Idoue, A. and Kato, T.},
	Booktitle = {Proc. of the 20$^{th}$ Int. Conf. on Applied Informatics ({AI})},
	Date-Added = {2005-08-25 15:34:14 +0900},
	Date-Modified = {2006-07-15 11:51:44 +0900},
	Keywords = {failure detection, SNMP},
	Language = {English},
	Location = {Innsbruck, Austria},
	Month = {February},
	Pages = {351--482},
	Title = {A Failure Detection Procedure for Internet based on Communication Retrial},
	Url = {http://pads1.cs.nthu.edu.tw/~liaoweik/download/p149.pdf},
	Year = {2002},
	Abstract = {According to the spread of Internet, various Failures become serious problems. It is considered that Failures are difficult to find for the following reasons. One is that the Internet communication includes multiple server accesses with individual protocols, and failures may occur in any of them.
Another is that the Internet consists of multiple networks, such as a LAN within a company and an Internet service provider network, and a failure occurred in one network is difficult to find by the outside networks. We are proposing a failure detection procedure by retrying a communication on which a user reported any problems. The purpose of our approach is to detect various Failures, such as failures caused by congestion and those caused by protocol procedures as well as a hardware failure. In this paper, we present the overview and the details of our approach. }}

@inproceedings{YONK03,
	Address = {Prague},
	Author = {Yoshikawa, T. and Ohta, K. and Nakagawa, T. and Kurakake, S.},
	Booktitle = {Proceedings of the 14th International Workshop on Database and Expert Systems Applications (DEXA'03)},
	Date-Added = {2005-08-25 15:19:04 +0900},
	Date-Modified = {2005-08-25 15:23:36 +0900},
	Isbn = {0-7695-1993-8},
	Issn = {1529-4188},
	Language = {English},
	Month = {September},
	Organization = {IEEE Computer Society},
	Pages = {144--148},
	Title = {Mobile Web Service Platform for Robust, Responsive Distributed Application},
	Url = {http://doi.ieeecomputersociety.org/10.1109/DEXA.2003.1232014},
	Year = {2003},
	Abstract = {In mobile computing environments, distributed applications are provided over the wireless network that is unstable. The goal of our research is to offer service stability by rapid failure detection and recovery by switching to service management required by each client application through the cooperation of client middleware and the overlay network. This paper describes adaptive monitoring (AM), which detects failure rapidly with only a slight addition in network load, and Monitoring Information Notification Protocol (MINP), which transfer failure information efficiently. Experiments on a testbed hosting real Web Services confirm that our system can detect failure and switch to a service alternative within the recovery time demanded by most applications.
We also confirm that the proposed method is effective in reducing network load as well as satisfying application requirements.}}

@inproceedings{Beck91,
	Author = {Becker, T.},
	Booktitle = {Proc. of the 10$^{th}$ Symp. on Reliable Distributed Systems (SRDS)},
	Date-Added = {2005-08-25 14:44:46 +0900},
	Date-Modified = {2006-07-15 11:08:24 +0900},
	Keywords = {failure detection},
	Language = {English},
	Month = {September},
	Pages = {198--205},
	Title = {Keeping processes under surveillance},
	Url = {http://ieeexplore.ieee.org/iel2/365/3900/00145424.pdf?isnumber=3900&prod=CNF&arnumber=145424&arSt=198&ared=205&arAuthor=Becker%2C+T.},
	Year = {1991},
	Abstract = {Fault tolerance in a distributed system can be achieved by replicating service processes and running them on different computer nodes. To preserve fault-tolerance during the life time of the system, for each process which has crashed, a new replica must be kept under surveillance to detect process or node crashes as soon as possible. In this paper we describe two solutions for the surveillance problem. Both approaches are based on an election algorithm which has to cope with process and communication failures. The election algorithm is presented in detail. }}

@inproceedings{GM98,
	Author = {Garg, V. K. and Mitchell, J. R.},
	Booktitle = {Proc. of the 18th Conf. on Foundations of Software Technology and Theoretical Computer Science},
	Date-Added = {2005-08-25 14:19:37 +0900},
	Date-Modified = {2006-07-14 18:03:30 +0900},
	Isbn = {3-540-65384-8},
	Keywords = {failure detection},
	Language = {English},
	Publisher = {Springer-Verlag},
	Title = {Implementable Failure Detectors in Asynchronous Systems},
	Url = {http://www.springerlink.com/media/DFDQYMMYVN4Q70LXGNDM/Contributions/2/5/B/T/25BTWTBKC1GPB2T9.pdf},
	Year = {1998}}

@inproceedings{FRT01,
	Author = {Fetzer, C. and Raynal, M. and Tronel, F.},
	Booktitle = {Proc. of the 8$^{th}$ Pacific Rim Symp.
on Dependable Computing (PRDC)},
	Date-Added = {2005-08-25 14:02:16 +0900},
	Date-Modified = {2006-07-15 11:29:56 +0900},
	Keywords = {failure detection},
	Language = {English},
	Location = {Seoul, Korea},
	Pages = {146--153},
	Title = {An Adaptive Failure Detection Protocol},
	Url = {http://doi.ieeecomputersociety.org/10.1109/PRDC.2001.992691},
	Year = {2001},
	Abstract = {The detection of process failures is a crucial problem system designers have to cope with in order to build fault-tolerant distributed platforms. Unfortunately, it is impossible to distinguish with certainty a crashed process from a very slow process in a purely asynchronous distributed system. This prevents some problems to be solved in such systems. That is why failure detector oracles have been introduced to circumvent these impossibility results.}}

@inproceedings{GLS95,
	Address = {Bad Neuenahr, Germany},
	Author = {Guerraoui, R. and Larrea, M. and Schiper, A.},
	Booktitle = {Proceedings of the 14th Symposium on Reliable Distributed Systems (SRDS-14)},
	Date-Added = {2005-08-25 12:00:46 +0900},
	Date-Modified = {2005-11-02 15:19:46 +0900},
	Isbn = {0-8186-7153-X},
	Keywords = {failure detection, atomic commitment},
	Language = {English},
	Month = {September},
	Organization = {IEEE Computer Society},
	Pages = {41--50},
	Publisher = {IEEE Computer Society},
	Title = {Non Blocking Atomic Commitment with an Unreliable Failure Detector},
	Url = {http://doi.ieeecomputersociety.org/10.1109/RELDIS.1995.518722},
	Year = {1995}}

@inproceedings{LFA00,
	Author = {Larrea, M. and Fernandez, A. and Arevalo, S.},
	Booktitle = {Proc. of the 19$^{th}$ Symp.
on Reliable Distributed Systems (SRDS)},
	Date-Added = {2005-08-25 11:51:34 +0900},
	Date-Modified = {2006-07-15 11:44:45 +0900},
	Language = {English},
	Location = {N{\"u}rnberg, Germany},
	Pages = {52--59},
	Title = {Optimal Implementation of the Weakest Failure Detector for Solving Consensus},
	Url = {http://doi.ieeecomputersociety.org/10.1109/RELDI.2000.885392},
	Year = {2000},
	Abstract = {The concept of unreliable failure detector was introduced by Chandra and Toueg as a mechanism that provides information about process failures. Depending on the properties the failure detector guarantee, they proposed taxonomy of failure detectors. It has been shown that one of the classes of this taxonomy, namely eventually Strong ($\Diamond S$), is the weakest class allowing solving the Consensus problem. In this paper, we present a new algorithm implementing $\Diamond S$. Our algorithm guarantees that eventually all the correct processes agree on a common correct process. This property trivially allows us to provide the accuracy and completeness properties required by $\Diamond S$. We show, then, that our algorithm is better than any other proposed implementation of $\Diamond S$ in terms of the number of messages and the total amount of information periodically sent. In particular, previous algorithms require to periodically exchanging at least a quadratic amount of information, while ours only requires O(n log n) (where n is the number of processes). However, we also propose a new measure to evaluate the efficiency of this kind of algorithms, the eventual monitoring degree, which does not rely on a periodic behavior and expresses better the degree of processing required by the algorithms. We show that the runs of our algorithm have optimal eventual monitoring degree.}}

@inproceedings{NJ04,
	Author = {Nunes, R. C. and Jansch-P{\^o}rto, I.},
	Booktitle = {Proc. of the Int. Conf.
on Dependable Systems and Networks (DSN'04)},
	Date-Added = {2005-08-25 11:44:16 +0900},
	Date-Modified = {2006-11-06 17:29:18 +0900},
	Isbn = {0-7695-2052-9},
	Keywords = {failure detection},
	Language = {English},
	Title = {{QoS} of Timeout-Based Self-Tuned Failure Detectors: The Effects of the Communication Delay Predictor and the Safety Margin},
	Url = {http://doi.ieeecomputersociety.org/10.1109/DSN.2004.1311946},
	Year = {2004}}

@inproceedings{FB05,
	Author = {Falai, L. and Bondavalli, A.},
	Booktitle = {Proc. of the Int. Conf. on Dependable Systems and Networks (DSN'05)},
	Date-Added = {2005-08-25 11:36:22 +0900},
	Date-Modified = {2006-11-06 17:05:39 +0900},
	Keywords = {failure detection, WAN, QoS},
	Language = {English},
	Location = {Yokohama, Japan},
	Month = {June},
	Pages = {624--633},
	Title = {Experimental Evaluation of the {QoS} of Failure Detectors on Wide Area Network},
	Url = {http://doi.ieeecomputersociety.org/10.1109/DSN.2005.47},
	Year = {2005},
	Abstract = {This paper describes an experiment performed on Wide Area Network to assess and fairly compare the Quality of Service provided by a large family of failure detectors. Failure detectors are a popular middleware mechanism used for improving the dependability of distributed systems and applications. Their QoS greatly influences the QoS that upper layers may provide. It is thus of uttermost importance to equip a system with an appropriate failure detector and to properly tune its parameters for the most desirable QoS to be provided. The paper first analyzes the QoS indicators and the structure of push-style failure detectors and then introduces the choices for estimators and safety margins used to build several (30) failure detectors. The experimental setup designed and implemented to allow a fair comparison of QoS of the several alternatives in a real representative experimental setting is then described.
Finally the results obtained through the experiments and their interpretation are provided.}}

%% GS96a: Springer-Verlag is the publisher, not an editor; the LNCS number is stored in Volume.
@inproceedings{GS96a,
	Address = {Bologna, Italy},
	Author = {Guerraoui, R. and Schiper, A.},
	Booktitle = {Proceedings of the 10th International Workshop on Distributed Algorithms (WDAG-10)},
	Date-Added = {2005-08-25 11:07:49 +0900},
	Date-Modified = {2006-04-11 10:25:46 +0900},
	Keywords = {failure detection},
	Month = {October},
	Publisher = {Springer-Verlag},
	Series = {LNCS},
	Title = {Gamma-accurate failure detectors},
	Url = {http://lsewww.epfl.ch/Publications/ById/32.html},
	Volume = {1151},
	Year = {1996}}

@book{MS01,
	Address = {Sebastopol, CA, USA},
	Author = {Mauro, D. R. and Schmidt, K. J.},
	Date-Added = {2005-08-25 10:58:21 +0900},
	Date-Modified = {2005-08-25 11:02:57 +0900},
	Isbn = {0-596-00020-0},
	Keywords = {SNMP},
	Language = {English},
	Month = {July},
	Publisher = {O'Reilly and Associates},
	Title = {Essential {SNMP}},
	Url = {http://www.unix.org.ua/orelly/networking_2ndEd/snmp/},
	Year = {2001}}

@book{Kretch03,
	Author = {Kretchmar, J.},
	Date-Added = {2005-08-25 10:44:33 +0900},
	Date-Modified = {2005-08-25 10:47:16 +0900},
	Isbn = {0130462101},
	Keywords = {Nagios},
	Language = {English},
	Month = {October},
	Publisher = {Prentice Hall Professional Technical Reference},
	Series = {Computer Networking and Distributed Systems},
	Title = {Open Source Network Administration},
	Year = {2003}}

%% Fink99: a journal article (volume 27, issue 10), so typed as an article rather than a book chapter.
@article{Fink99,
	Author = {Finkel, R. A.},
	Date-Added = {2005-08-24 18:33:42 +0900},
	Date-Modified = {2006-08-01 13:54:24 +0900},
	Journal = {Software: Practice and Experience},
	Keywords = {failure detection, toolkit},
	Language = {English},
	Number = {10},
	Pages = {1163--1176},
	Publisher = {Wiley Interscience},
	Title = {Pulsar: an extensible tool for monitoring large {Unix} sites},
	Url = {http://www3.interscience.wiley.com/cgi-bin/abstract/7264/ABSTRACT},
	Volume = {27},
	Year = {1997}}

@inproceedings{AP97,
	Address = {San Diego, CA, USA},
	Author = {Anderson, E.
and Patterson, D.},
	Booktitle = {Proceedings of the 11th {USENIX} Conference on System Administration ({LISA '97})},
	Date-Added = {2005-08-24 18:16:34 +0900},
	Date-Modified = {2005-11-02 15:24:50 +0900},
	Keywords = {failure detection},
	Language = {English},
	Organization = {USENIX},
	Pages = {9--16},
	Publisher = {USENIX},
	Title = {Extensible, Scalable Monitoring for Clusters of Computers},
	Url = {https://www.usenix.org/publications/library/proceedings/lisa97/full_papers/02.anderson/02.pdf},
	Year = {1997}}

@inproceedings{GCG01,
	Author = {Gupta, I. and Chandra, T. D. and Goldszmidt, G.},
	Booktitle = {Proc. of the 20$^{th}$ Annual Symp. on Principles of Distributed Computing (PODC)},
	Date-Added = {2005-08-24 18:04:39 +0900},
	Date-Modified = {2006-07-15 11:50:33 +0900},
	Keywords = {failure detection, scalability},
	Language = {English},
	Pages = {170--179},
	Title = {On Scalable and Efficient Distributed Failure Detectors},
	Url = {http://www.cs.cornell.edu/gupta/podc2001.final.ps},
	Year = {2001}}

@inproceedings{DFKM97,
	Author = {Dolev, D. and Friedman, R. and Keidar, I. and Malkhi, D.},
	Booktitle = {Proc. of the Symp. on Principles of Distributed Computing (PODC)},
	Date-Added = {2005-08-24 17:57:08 +0900},
	Date-Modified = {2006-07-14 17:58:23 +0900},
	Keywords = {failure detection},
	Language = {English},
	Month = {September},
	Pages = {286--302},
	Title = {Failure Detectors in Omission Failure Environments},
	Url = {http://citeseer.ist.psu.edu/dolev96failure.html},
	Year = {1997},
	Abstract = {We study failure detectors in an asynchronous environment that admits message omission failures. In such environments, processes may fail by crashing, but may also disconnect from each other.
We adapt Chandra and Toueg's definitions of failure detection completeness and accuracy to the omissions failure model, and define a weak failure detector $\Diamond W(om)$ that allows any majority of the processes that become connected to reach a Consensus decision, despite any number of transient communication failures in their past. We provide a protocol that solves the Consensus problem in this model whenever a majority of the processes become connected, regardless of past omissions. Moreover, in our protocol it is not necessary to save and repeatedly send all past messages, which makes it more efficient than previous protocols in this model.}}

%% SBCF03: author names and title spelling corrected; NOTE(review): confirm against the DOI record.
@inproceedings{SBCF03,
	Author = {Sampaio, L. M. R. and Brasileiro, F. V. and Cirne, W. and de Figueiredo, J. C. A.},
	Booktitle = {Proc. of the Int. Conf. on Dependable Systems and Networks (DSN)},
	Date-Added = {2005-08-24 17:26:29 +0900},
	Date-Modified = {2006-07-15 10:51:12 +0900},
	Keywords = {failure detection},
	Language = {English},
	Location = {San Francisco, CA, USA},
	Month = {June},
	Pages = {551--561},
	Title = {How Bad Are Wrong Suspicions? Towards Adaptive Distributed Protocols},
	Url = {http://doi.ieeecomputersociety.org/10.1109/DSN.2003.1209965},
	Year = {2003}}

@inproceedings{BMS03,
	Author = {Bertier, M. and Marin, O. and Sens, P.},
	Booktitle = {Proc. of the Int. Conf. on Dependable Systems and Networks ({DSN})},
	Date-Added = {2005-08-16 19:03:40 +0900},
	Date-Modified = {2006-07-15 12:11:08 +0900},
	Isbn = {0-7695-1952-0},
	Keywords = {failure detection},
	Language = {English},
	Location = {San Francisco, CA, USA},
	Month = {June},
	Pages = {635--644},
	Title = {Performance analysis of a hierarchical failure detector},
	Url = {http://citeseer.ist.psu.edu/674456.html},
	Year = {2003},
	Abstract = {We present a new failure detector implementation. This implementation, a variant of the heartbeat failure detector, is both adaptable and designed for scalability.
Its first specificity of our implementation lies in the fact that it is designed as a shared service among several applications by way of an adaptation layer between the failure detector and the application. This layer adapts the quality of service according to application needs. The second specificity is the hierarchic organization of the detection service: it allows to decrease the number of message and the processor load. Through an experimentation evaluation, we show that our implementation is adaptable to the environment characteristics and usable with large scale applications. }}

@inproceedings{ACT97b,
	Author = {Aguilera, M. K. and Chen, W. and Toueg, S.},
	Booktitle = {Workshop on Distributed Algorithms},
	Date-Added = {2005-08-16 18:54:33 +0900},
	Date-Modified = {2005-11-02 15:21:20 +0900},
	Keywords = {failure detection},
	Language = {English},
	Pages = {126--140},
	Title = {Heartbeat: A Timeout-Free Failure Detector for Quiescent Reliable Communication},
	Url = {http://citeseer.ist.psu.edu/article/aguilera97heartbeat.html},
	Year = {1997}}

@inproceedings{RMH98,
	Author = {van Renesse, R. and Minsky, Y. and Hayden, M.},
	Booktitle = {Proc. of the Int. Conf. on Distributed Systems Platforms and Open Distributed Processing (Middleware)},
	Date-Added = {2005-08-16 18:40:18 +0900},
	Date-Modified = {2006-07-15 12:14:33 +0900},
	Editor = {Davies, N. and Raymond, K. and Seitz, J.},
	Keywords = {failure detection, service},
	Language = {English},
	Location = {The Lake District, UK},
	Month = {September},
	Pages = {55--70},
	Title = {A Gossip-Based Failure Detection Service},
	Url = {http://www.cs.cornell.edu/Info/People/rvr/papers/GossipFD.pdf},
	Year = {1998},
	Abstract = {Failure Detection is valuable for system management, replication, load balancing, and other distributed services. To date, Failure Detection Services scale badly in the number of members that are being monitored.
This paper describes a new protocol based on gossiping that does scale well and provides timely detection. We analyze the protocol, and then extend it to discover and leverage the underlying network topology for much improved resource utilization. We then combine it with another protocol, based on broadcast, that is used to handle partition failures. }}

@inproceedings{CTA00,
	Address = {New York, NY, USA},
	Author = {Chen, W. and Toueg, S. and Aguilera, M. K.},
	Booktitle = {Proceedings of the International Conference on Dependable Systems and Networks (DSN 2000)},
	Date-Added = {2005-08-15 14:50:48 +0900},
	Date-Modified = {2005-11-02 15:22:12 +0900},
	Isbn = {0-7695-0707-7},
	Keywords = {failure detection},
	Language = {English},
	Month = {June},
	Pages = {191--202},
	Publisher = {IEEE Computer Society Press},
	Title = {On the Quality of Service of Failure Detectors},
	Url = {http://citeseer.ist.psu.edu/chen00quality.html},
	Year = {2000}}

@techreport{RFC2925,
	Author = {White, K.},
	Date-Added = {2005-08-15 14:23:53 +0900},
	Date-Modified = {2006-03-08 15:29:00 +0900},
	Institution = {IETF},
	Keywords = {SNMP, RFC},
	Language = {English},
	Number = {2925},
	Title = {Definitions of Managed Objects for Remote Ping, Traceroute, and Lookup Operations},
	Type = {RFC},
	Url = {http://www.faqs.org/rfcs/rfc2925.html},
	Year = {2000},
	Abstract = {This memo defines Management Information Bases (MIBs) for performing remote ping, traceroute and lookup operations at a remote host. When managing a network it is useful to be able to initiate and retrieve the results of ping or traceroute operations when performed at a remote host. A Lookup capability is defined in order to enable resolving of either an IP address to an DNS name or an DNS name to an IP address at a remote host. Currently, there are several enterprise-specific MIBs for performing remote ping or traceroute operations.
The purpose of this memo is to define a standards-based solution to enable interoperability.}} @techreport{RFC2573, Author = {Levi, D. and Meyer, P. and Stewart, B.}, Date-Added = {2005-08-15 12:06:16 +0900}, Date-Modified = {2006-03-08 15:17:09 +0900}, Institution = {IETF}, Keywords = {SNMP, RFC}, Language = {English}, Number = {2573}, Title = {{SNMP} Applications}, Type = {RFC}, Url = {http://www.faqs.org/rfcs/rfc2573.html}, Year = {1999}} @techreport{RFC2790, Author = {Waldbusser, S. and Grillo, P.}, Date-Added = {2005-07-15 17:33:56 +0900}, Date-Modified = {2006-03-08 15:17:04 +0900}, Institution = {Internet Engineering Task Force (IETF)}, Keywords = {SNMP, RFC}, Language = {English}, Number = {2790}, Title = {Host Resources {MIB}}, Type = {RFC}, Url = {http://www.ietf.org/rfc/rfc2790.txt}, Year = {2000}} @article{DUS04, Author = {D{\'e}fago, X. and Urb{\'a}n, P. and Schiper, A.}, Date-Added = {2005-07-08 15:47:44 +0900}, Date-Modified = {2005-11-02 15:18:04 +0900}, Journal = {ACM Computing Surveys}, Keywords = {atomic broadcast}, Language = {English}, Month = {December}, Number = {4}, Pages = {372--421}, Title = {Total Order Broadcast and Multicast Algorithms: Taxonomy and Survey}, Url = {http://scholar.google.com/url?sa=U&q=http://portal.acm.org/ft_gateway.cfm%3Fid%3D1041682%26type%3Dpdf}, Volume = {36}, Year = {2004}} @inproceedings{HDYK04, Author = {Hayashibara, N. and D{\'e}fago, X. and Yared, R. and Katayama, T.}, Booktitle = {Proc. of the 23$^{rd}$ Int. Symp. 
on Reliable Distributed Systems (SRDS'04)}, Date-Added = {2005-07-06 14:08:27 +0900}, Date-Modified = {2006-11-06 17:06:54 +0900}, Keywords = {failure detection}, Language = {English}, Local-Url = {file://localhost/Users/wiesmann/Documents/Papers/00_others/HDYK04.pdf}, Location = {Florian{\'o}polis, Brazil}, Month = {October}, Pages = {66--78}, Title = {The {$\Phi$} Accrual Failure Detector}, Url = {http://doi.ieeecomputersociety.org/10.1109/RELDIS.2004.1353004}, Year = {2004}} @inproceedings{WMS04, Address = {Leuven, Belgium}, Author = {Wilkes, J. and Mogul, J. and Suermondt, J.}, Booktitle = {Proceedings of the 11th ACM SIGOPS European Workshop}, Date-Added = {2005-07-06 11:52:01 +0900}, Date-Modified = {2005-11-02 15:36:36 +0900}, Keywords = {virtualisation, utility computing}, Language = {English}, Month = {September}, Organization = {ACM}, Title = {Utilification}, Url = {http://www.hpl.hp.com/techreports/2004/HPL-2004-124.pdf}, Year = {2004}} @inproceedings{Murray05, Author = {Murray, P.}, Booktitle = {Proc. of the Int. Conf. on Dependable Systems and Networks (DSN)}, Date-Added = {2005-07-06 11:25:45 +0900}, Date-Modified = {2006-07-15 10:47:37 +0900}, Keywords = {failure detection, toolkit, service}, Language = {English}, Local-Url = {file://localhost/Users/wiesmann/Documents/Papers/00_others/Murray05.pdf}, Location = {Yokohama, Japan}, Month = {June}, Pages = {200--205}, Title = {A Distributed State Monitoring Service for Adaptive Application Management}, Url = {http://doi.ieeecomputersociety.org/10.1109/DSN.2005.6}, Year = {2005}, Abstract = {Anubis is a simple state monitoring service that supports coordinated action among distributed management agents. It uses a temporal consistency model that addresses symmetric and asymmetric network partitions. 
We have used Anubis to support distributed management of adaptive applications in Grid and Utility computing environments and our experience has shown that the abstraction and properties provided by the service simplify the task of programming distributed management behavior. We support this claim by examining three common use cases that our developers encountered, namely: resource management, lifecycle coordination, and compositional failure management.}} @inproceedings{BMS02, Author = {Bertier, M. and Marin, O. and Sens, P.}, Booktitle = {Proceedings of the 2002 International Conference on Dependable Systems and Networks (DSN)}, Date-Added = {2005-06-22 18:16:11 +0900}, Date-Modified = {2005-11-02 15:21:42 +0900}, Isbn = {0-7695-1597-5}, Keywords = {failure detection}, Language = {English}, Organization = {IEEE Computer Society}, Pages = {354--363}, Title = {Implementation and Performance Evaluation of an Adaptable Failure Detector}, Url = {http://portal.acm.org/citation.cfm?id=738261}, Year = {2002}} @inproceedings{AR04, Author = {de Ara{\'u}jo Mac{\^e}do, R. and {Lima e Lima}, F. Ramon}, Booktitle = {Anais do 22o. Simp{\'o}sio Brasileiro de Redes de Computadores}, Date-Added = {2005-06-20 12:46:05 +0900}, Date-Modified = {2006-07-14 17:56:37 +0900}, Keywords = {SNMP, failure detection}, Language = {English}, Local-Url = {/Users/wiesmann/Documents/Papers/00_others/AR04.pdf}, Location = {Gramado, RS, Brazil}, Pages = {583--586}, Title = {Improving the Quality of Service of Failure Detectors with {SNMP} and Artificial Neural Networks}, Url = {http://www.lasid.ufba.br/public/artigos/2164.pdf}, Year = {2004}} @inproceedings{DUHK05, Author = {D{\'e}fago, X. and Urb{\'a}n, P. and Hayashibara, N. and Katayama, T.}, Booktitle = {Proc. of the Int. Conf. 
on Dependable Systems and Networks (DSN'05)}, Date-Added = {2005-06-13 15:14:42 +0900}, Date-Modified = {2006-11-06 17:10:46 +0900}, Keywords = {failure detection}, Language = {English}, Pages = {206--215}, Title = {Definition and Specification of Accrual Failure Detectors}, Url = {http://ddsg.jaist.ac.jp/en/pub/DUH+05.html}, Year = {2005}} @article{SRC84, Author = {Saltzer, J. H. and Reed, D. P. and Clark, D. D.}, Date-Modified = {2006-05-17 16:46:48 +0900}, Journal = {ACM Transactions on Computer Systems}, Keywords = {end-to-end}, Language = {English}, Month = {November}, Number = {4}, Pages = {277--288}, Title = {End-to-End Arguments in System Design}, Url = {http://doi.acm.org/10.1145/357401.357402}, Volume = {2}, Year = {1984}, Abstract = {This paper presents a design principle that helps guide placement of functions among the modules of a distributed computer system. The principle, called the end-to-end argument, suggests that functions placed at low levels of a system may be redundant or of little value when compared with the cost of providing them at that low level. Examples discussed in the paper include bit-error recovery, security using encryption, duplicate message suppression, recovery from system crashes, and delivery acknowledgment. Low-level mechanisms to support these functions are justified only as performance enhancements. }} @article{HSAA03, Author = {Holliday, J. and Steinke, R. and
Agrawal, D. and El~Abbadi, A.}, Date-Modified = {2006-03-08 15:26:14 +0900}, Journal = {IEEE Transactions on Knowledge and Data Engineering}, Keywords = {Database, replication, epidemic algorithm}, Language = {English}, Month = {September--October}, Number = {5}, Pages = {1218--1238}, Title = {Epidemic algorithms in replicated databases}, Url = {http://csdl.computer.org/comp/trans/tk/2003/05/k1218abs.htm}, Volume = {15}, Year = {2003}} @mastersthesis{Mul04, Address = {Switzerland}, Author = {M{\"u}ller, M.}, Date-Modified = {2006-08-01 13:49:36 +0900}, Keywords = {SNMP, failure detection}, Language = {English}, Month = {February}, School = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne}, Title = {Performance evaluation of a failure detector using {SNMP}}, Type = {Semester Project}, Url = {http://infoscience.epfl.ch/search.py?recid=49897}, Year = {2004}} @book{TS02, Author = {Tanenbaum, A. S. and van Steen, M.}, Date-Modified = {2006-08-01 14:08:34 +0900}, Isbn = {0-13-088893-1}, Keywords = {distributed systems}, Publisher = {Prentice Hall}, Title = {Distributed Systems: Principles and Paradigms}, Year = {2002}} @inbook{MWF+04, Author = {Montresor, A. and Wiesmann, M. and Fahrenholtz, D. and Jim{\'e}nez-Peris, R. and Pati{\~n}o-Mart{\'\i}nez, M.}, Chapter = {Group communication}, Date-Modified = {2006-08-01 13:13:35 +0900}, Editor = {Romanovsky, A.}, Keywords = {group communication}, Language = {English}, Number = {10}, Pages = {77--81}, Publisher = {Cabernet, Information Society Technologies (IST)}, Title = {Cabernet Vision of Research and Technology Development in Distributed and Dependable Systems}, Url = {http://www.newcastle.research.ec.org/cabernet/research/rtd/final/vision-2004.pdf}, Year = {2004}} @article{PGS03, Author = {Pedone, F. and Guerraoui, R. 
and Schiper, A.}, Journal = {Distributed and Parallel Databases}, Keywords = {Database replication, Dragon}, Language = {English}, Month = {July}, Number = {1}, Pages = {71--98}, Title = {The Database State Machine Approach}, Url = {http://www.kluweronline.com/article.asp?PIPS=5117548}, Volume = {14}, Year = {2003}} @article{PRO03, Author = {Pereira, J. and Rodrigues, L. and Oliveira, R.}, Date-Modified = {2006-08-23 15:02:02 +0900}, Journal = {IEEE Transactions on Computers}, Keywords = {reliable broadcast, group communication}, Language = {English}, Month = {February}, Number = {2}, Pages = {150--165}, Title = {Semantically Reliable Multicast: Definition, Implementation and Performance Evaluation}, Url = {http://doi.ieeecomputersociety.org/10.1109/TC.2003.1176983}, Volume = {52}, Year = {2003}, Abstract = {Semantic Reliability is a novel correctness criterion for multicast protocols based on the concept of message obsolescence: A message becomes obsolete when its content or purpose is superseded by a subsequent message. By exploiting obsolescence, a reliable multicast protocol may drop irrelevant messages to find additional buffer space for new messages. This makes the multicast protocol more resilient to transient performance perturbations of group members, thus improving throughput stability. This paper describes our experience in developing a suite of semantically reliable protocols. It summarizes the motivation, definition, and algorithmic issues and presents performance figures obtained with a running implementation. The data obtained experimentally is compared with analytic and simulation models. This comparison allows us to confirm the validity of these models and the usefulness of the approach. Finally, the paper reports the application of our prototype to distributed multiplayer games.}, Annote = {Special Issue on Reliable Distributed Systems}} @inproceedings{MPR03, Address = {Florence, Italy}, Author = {Monteiro, M. J. and Pereira, J. 
and Rodrigues, L.}, Booktitle = {Proceedings of International Workshop on Large-Scale Group Communication}, Keywords = {group communication, application}, Language = {English}, Title = {Integration of {Flight Simulator 2002} with an epidemic multicast protocol}, Url = {http://www.di.fc.ul.pt/~ler/reports/Srds03Workshop.pdf}, Year = {2003}} @book{Lynch96, Address = {San Francisco, CA}, Author = {Lynch, N.}, Date-Modified = {2006-08-01 14:05:27 +0900}, Isbn = {1-55860-348-4}, Keywords = {distributed systems}, Language = {English}, Publisher = {Morgan Kaufmann}, Title = {Distributed Algorithms}, Url = {http://theory.lcs.mit.edu/tds/distalgs.html}, Year = {1996}} @unpublished{AAA96, Author = {Agrawal, D. and Alonso, G. and El~Abbadi, A.}, Date-Modified = {2006-03-08 19:18:49 +0900}, Institution = {Internal Report}, Keywords = {database, broadcast, replication}, Language = {English}, Note = {Sketch for~\cite{AAES97}}, Title = {Broadcasting in Replicated Databases}, Year = {1996}} @techreport{AAAS96, Address = {Santa Barbara, California USA}, Alternate-Key = {Agrawal96}, Author = {Agrawal, D. and Alonso, G. and {El Abbadi}, A. and Stanoi, I.}, Date-Modified = {2006-04-11 10:35:46 +0900}, Institution = {Department of Computer Science, University of California}, Keywords = {atomic broadcast, database replication}, Language = {English}, Title = {Exploiting Atomic Broadcast in Replicated Databases}, Year = {1996}} @inproceedings{AAES97, Address = {Passau (Germany)}, Alt-Url = {http://www.cs.ucsb.edu/~ioana/europar97.ps}, Alternate-Key = {Agrawal97}, Author = {Agrawal, D. and Alonso, G. and {El~Abbadi}, A. 
and Stanoi, I.}, Booktitle = {Proceedings of EuroPar ({EuroPar}'97)}, Date-Modified = {2006-05-17 17:04:20 +0900}, Institution = {University of California at Santa Barbara and Swiss Federal Institute of Technology}, Keywords = {Dragon, group communication,database replication}, Language = {English}, Title = {Exploiting Atomic Broadcast in Replicated Databases}, Url = {http://www.inf.ethz.ch/personal/alonso/PAPERS/EPar97.ps.Z}, Year = {1997}} @inproceedings{AAS97, Address = {Tucson, Arizona {USA}}, Author = {Agrawal, D. and El~Abbadi, A. and Steinke, R.~C.}, Booktitle = {{PODS} '97. Proceedings of the Sixteenth {ACM} {SIGACT-SIGMOD-SIGART} Symposium on Principles of Database Systems}, Keywords = {algorithms; performance}, Language = {English}, Month = {May}, Organization = {ACM}, Pages = {161--172}, Publisher = {{ACM} Press}, Title = {Epidemic algorithms in replicated databases (extended abstract)}, Url = {http://www.acm.org:80/pubs/articles/proceedings/pods/263661/p161-agrawal/p161-agrawal.pdf}, Year = {1997}} @inproceedings{ABKW98, Address = {Seattle, Washington}, Author = {Anderson, T. and Breitbart, Y. and Korth, H.~F. and Wool, A.}, Booktitle = {Proceedings of the 1998 {ACM} {SIGMOD}}, Date-Modified = {2006-03-08 15:24:45 +0900}, Keywords = {Database, Simulation}, Language = {English}, Title = {Replication, consistency, and practicality: are these mutually exclusive?}, Url = {http://www.bell-labs.com/user/yash/sigmod98.ps}, Year = {1998}} @article{ACL87, Author = {Agrawal, R. and Carey, M.~J. 
and Livny, M.}, Date-Modified = {2006-05-17 17:03:26 +0900}, Journal = {ACM Transactions on Database Systems}, Keywords = {Database, modeling}, Language = {English}, Number = {4}, Pages = {609--654}, Title = {Concurrency Control Performance Modeling: Alternatives and Implications}, Url = {http://www.acm.org/pubs/articles/journals/tods/1987-12-4/p609-agrawal/p609-agrawal.pdf}, Volume = {12}, Year = {1987}} @article{ACT00, Alt-Url = {http://www.cs.cornell.edu/home/sam/FDpapers/crash-recovery-finaldcversion.ps}, Author = {Aguilera, M.~K. and Chen, W. and Toueg, S.}, Date-Modified = {2006-09-15 15:15:13 +0900}, Issn = {0178-2770}, Journal = {Distributed Computing}, Keywords = {consensus, failure detection}, Language = {English}, Number = {2}, Pages = {99--125}, Publisher = {Springer}, Title = {Failure Detection and Consensus in the Crash Recovery Model}, Url = {http://link.springer.de/link/service/journals/00446/papers/0013002/00130099.pdf}, Volume = {13}, Year = {2000}} @techreport{ACT97, Author = {Aguilera, M.~K. and Chen, W. and Toueg, S.}, Date-Modified = {2006-03-08 19:22:08 +0900}, Institution = {Cornell University, Computer Science Department}, Keywords = {consensus, reliable broadcast}, Language = {English}, Number = {TR97-1632}, Pages = {24}, Title = {Quiescent Reliable Communication and Quiescent Consensus in Partitionable Networks}, Year = {1997}} @inproceedings{ACT98, Address = {Andros, Greece}, Author = {Aguilera, M.~K. and Chen, W. 
and Toueg, S.}, Booktitle = {Proceedings of the $12^{th}$ International Symposium on Distributed Computing (DISC'1998 formerly {WDAG})}, Date-Modified = {2006-03-08 19:37:37 +0900}, Isbn = {3-540-65066-9}, Keywords = {failure detection}, Language = {English}, Pages = {231--245}, Publisher = {Springer Verlag}, Series = {Lecture Notes in Computer Science}, Title = {Failure Detection and Consensus in the Crash Recovery Model}, Volume = {1499}, Year = {1998}} @inproceedings{ADKM92, Address = {Boston, Massachusetts, {USA}}, Author = {Amir, Y. and Dolev, D. and Kramer, S. and Malki, D.}, Booktitle = {$22^{nd}$ Annual International Symposium on Fault-Tolerant Computing (FTCS)}, Date-Modified = {2006-06-01 15:47:04 +0200}, Isbn = {0-8186-2875-8}, Keywords = {toolkit, group communication}, Language = {English}, Organization = {{IEEE} Computer Society}, Pages = {76--84}, Title = {{Transis}: A Communication Sub-System for High Availability}, Url = {http://www.cs.jhu.edu/~yairamir/ftcs-22.ps.gz}, Year = {1992}} @techreport{ADMSM94, Author = {Amir, Y. and Dolev, D. and Melliar-Smith, P. M. and Moser, L. E.}, Date-Modified = {2006-03-08 19:17:32 +0900}, Institution = {The Hebrew University of Jerusalem, Institute of Computer Science}, Keywords = {group communication, replication}, Language = {English}, Number = {CS94-20}, Title = {Robust and Efficient Replication using Group Communication}, Year = {1994}} @techreport{ADS02, Address = {Israel}, Author = {Anker, T. and Dolev, D. and Shnayderman, I.}, Date-Modified = {2006-03-08 19:24:58 +0900}, Institution = {The Hebrew University of Jerusalem}, Keywords = {group membership, group communication}, Language = {English}, Number = {2002-21}, Title = {Ad Hoc Membership for Scalable Applications}, Url = {http://leibniz.cs.huji.ac.il/research/abstract.php?abstract=479}, Year = {2002}} @inproceedings{AGG00, Address = {Taipei, Taiwan, R.O.C}, Author = {Amiri, K.~A. and Gibson, G.~A. 
and Golding, R.}, Booktitle = {Proceedings of $20^{th}$ International Conference on Distributed Computing Systems ({ICDCS}'2000)}, Language = {English}, Organization = {IEEE}, Pages = {298--307}, Publisher = {IEEE Computer}, Title = {Highly concurrent shared storage}, Year = {2000}} @inproceedings{AHC95, Author = {Al-Houmaily, Y.~J. and Chrysanthis, P.~K.}, Booktitle = {Proceedings of the $8^{th}$ {ISCA} International Conference on Parallel and Distributed Computing Systems}, Language = {English}, Pages = {554--560}, Title = {Two-Phase Commit in Gigabit-Networked Distributed Databases}, Url = {ftp://ftp.cs.pitt.edu/panos/PMDBnet/pdcs_95.ps}, Year = {1995}} @inproceedings{AHCL97, Address = {Birmingham, {U.K.}}, Author = {Al-Houmaily, Y.~J. and Chrysanthis, P.~K. and Levitan, S.}, Booktitle = {Proceedings of the $13^{th}$ {IEEE} International Conference on Data Engineering}, Date-Modified = {2006-03-08 19:18:15 +0900}, Keywords = {atomic commitment, 2pc}, Language = {English}, Pages = {255--265}, Title = {An Argument in Favor of Presumed Commit Protocol}, Url = {ftp://ftp.cs.pitt.edu/panos/PMDBnet/icde_97.ps.gz}, Year = {1997}} @article{AJB00, Author = {Albert, R. and Jeong, H. and Barab{\'a}si, A.~L.}, Journal = {Nature}, Language = {English}, Number = {6794}, Pages = {378--382}, Publisher = {Macmillan Magazines Ltd.}, Title = {Error and attack tolerance of complex networks}, Url = {http://www.nature.com/cgi-taf/DynaPage.taf?file=/nature/journal/v406/n6794/abs/406378a0_fs.html}, Volume = {406}, Year = {2000}} @inproceedings{AKAAGM96, Address = {New Orleans}, Alt-Url = {http://www.almaden.ibm.com/u/mohan/RJ9970.pdf}, Author = {Alonso, G. and Kamath, M. and Agrawal, D. and El~Abbadi, A. and G{\"u}nth{\"o}r, R. 
and Mohan, C.}, Booktitle = {Proceedings of the International Conference on Data Engineering}, Date-Modified = {2006-03-08 19:17:04 +0900}, Keywords = {transaction processing}, Language = {English}, Title = {Advanced Transaction Models in Workflow Contexts}, Url = {http://www.almaden.ibm.com/u/mohan/ICDE96.pdf}, Year = {1996}} @inproceedings{AL97, Address = {Santa Barbara, {CA}}, Author = {Adya, A. and Liskov, B.}, Booktitle = {Proceedings of the {ACM} Symposium on Principles of Distributed Computing ({PODC'97})}, Date-Modified = {2006-03-08 19:33:15 +0900}, Keywords = {consistency, lazy replication}, Language = {English}, Organization = {{ACM}}, Pages = {73--82}, Title = {Lazy Consistency Using Loosely Synchronized Clocks}, Url = {http://www.pmg.lcs.mit.edu/papers/podc97/lc.html}, Year = {1997}} @inproceedings{ALO00, Address = {San Diego, CA, USA}, Author = {Adya, A. and Liskov, B. and O'Neil, P. E.}, Booktitle = {Proceedings of the IEEE International Conference on Data Engineering (ICDE)}, Date-Modified = {2006-03-08 19:18:26 +0900}, Keywords = {database, consistency}, Language = {English}, Pages = {67--78}, Title = {Generalized Isolation Level Definitions}, Url = {http://www.pmg.lcs.mit.edu/~adya/pubs/published.pdf}, Year = {2000}} @manual{ANSI92, Address = {1819 L Street, NW, Washington, DC 20036, USA}, Date-Modified = {2006-08-01 14:03:20 +0900}, Key = {ANSI92}, Keywords = {transaction processing}, Language = {English}, Organization = {American National Standard for Information Systems}, Title = {{ANSI} {X3}.135-1992 -- Database Language {SQL}}, Year = {1992}} @article{AS87, Author = {Alpern, B. and Schneider, F.~B.}, Journal = {Distributed Computing}, Keywords = {omega}, Language = {English}, Pages = {117--126}, Title = {Recognizing safety and liveness}, Volume = {2}, Year = {1987}} @article{AW94, Author = {Attiya, H. 
and Welch, J.}, Date-Modified = {2006-03-08 19:01:17 +0900}, Journal = {ACM Transactions on Computer Systems}, Keywords = {consistency}, Language = {English}, Number = {2}, Pages = {91--122}, Title = {Sequential Consistency versus Linearizability}, Url = {http://www.acm.org/pubs/articles/journals/tocs/1994-12-2/p91-attiya/p91-attiya.pdf}, Volume = {12}, Year = {1994}} @techreport{AZ93, Author = {Acharya, S. and Zdonik, S. B.}, Institution = {Department of Computer Science, Brown University}, Language = {English}, Number = {CS-93-43}, Title = {An Efficient Scheme for Dynamic Data Replication}, Url = {ftp://ftp.cs.brown.edu/pub/techreports/93/cs93-43.ps.Z}, Year = {1993}} @inproceedings{Alonso97, Address = {Zinal (Valais, Switzerland)}, Author = {Alonso, G.}, Author-Url = {http://www.inf.ethz.ch/personal/alonso/}, Booktitle = {Proceedings of the $2^{nd}$ European Research Seminar on Advances in Distributed Systems ({ERSADS}'97)}, Date-Modified = {2006-03-08 19:17:19 +0900}, Keywords = {database, replication, group communication}, Language = {English}, Pages = {171--176}, Title = {Partial Database Replication and Group Communication Primitives (Extended Abstract)}, Url = {http://www.inf.ethz.ch/personal/alonso/PAPERS/ERSDAS97.ps.Z}, Year = {1997}} @inproceedings{Alsberg76, Author = {Alsberg, P. A. and Day, J. D.}, Booktitle = {Proceedings of the International Conference on Software Engineering}, Language = {English}, Title = {A Principle for Resilient Sharing of Distributed Resources}, Year = {1976}} @inproceedings{Anc93, Address = {Lisbon, Portugal}, Author = {Anceaume, E.}, Booktitle = {Proceedings of the $4^{th}$ Computer Society Workshop on Future Trends in Distributed Computing Systems (FTDCS-4)}, Date-Modified = {2006-03-08 19:21:46 +0900}, Keywords = {atomic broadcast}, Organization = {IEEE}, Pages = {166--172}, Title = {A Comparison of Fault-Tolerant Atomic Broadcast Protocols}, Year = {1993}} @techreport{Asilomar98, Address = {One Microsoft Way, Redmond, {WA} 98052}, Author = {Bernstein, P. and Brodie, M. 
and Ceri, S. and DeWitt, D. and Franklin, M. and Garcia-Molina, H. and Gray, J. and Held, J. and Hellerstein, J. and Jagadish, H. V. and Lesk, M. and Maier, D. and Naughton, J. and Pirahesh, H. and Stonebraker, M. and Ullman, J.}, Date-Modified = {2006-03-08 19:23:04 +0900}, Institution = {Microsoft Research}, Key = {Asilomar98}, Keywords = {database}, Language = {English}, Number = {MSR-TR-98-57}, Title = {The {Asilomar} Report on Database Research}, Url = {http://www.research.microsoft.com/scripts/pubdb/pubsasp.asp?RecordID=196}, Year = {1998}} @article{BAC+81, Author = {Blasgen, M.~W. and Astrahan, M.~M. and Chamberlin, D.~D. and Gray, J.~N. and King, W.~F. and Lindsay, B.~G. and Lorie, R.~A. and Mehl, J.~W. and Price, T.~G. and Putzolu, G.~R. and Schkolnick, M. and Selinger, P.~G. and Slutz, D.~R. and Strong, H.~R. and Traiger, I.~L. and Wade, B.~W. and Yost, R.~A.}, Date-Modified = {2006-04-11 10:36:00 +0900}, Issn = {0018-8670}, Journal = {{IBM} Systems Journal}, Keywords = {database}, Language = {English}, Number = {1}, Pages = {41--62}, Title = {{System R}: An architectural overview}, Volume = {20}, Year = {1981}} @article{BBG89, Author = {Beeri, C. and Bernstein, P. A. and Goodman, N.}, Date-Modified = {2006-03-08 19:01:32 +0900}, Institution = {Jerusalem, Isr}, Issn = {0004-5411}, Journal = {Journal of the ACM}, Keywords = {concurency control}, Language = {English}, Number = {2}, Pages = {230--269}, Title = {A Model for Concurrency in Nested Transaction Systems}, Url = {http://www.acm.org/pubs/articles/journals/jacm/1989-36-2/p230-beeri/p230-beeri.pdf}, Volume = {36}, Year = {1989}, Annote = {A serializability theory for nested transactions and{\newline} for multi-level database systems.}} @inproceedings{BBPV00, Address = {Cairo, Egypt}, Alt-Url = {http://www.vldb.org/conf/2000/P011.pdf}, Author = {Bobineau, C. and Bouganim, L. and Pucheral, P. 
and Valduriez, P.}, Booktitle = {Proceedings of the $26^{th}$ International Conference on Very Large Databases}, Isbn = {1-55860-715-3}, Language = {English}, Pages = {11--20}, Publisher = {Morgan Kaufmann}, Title = {{PicoDBMS}: Scaling down Database Techniques for the Smartcard}, Url = {http://dbms3.uta.edu/vldb2000/papers/RP02.pdf}, Year = {2000}} @inproceedings{BC96, Address = {France}, Author = {Banerjee, S. and Chrysanthis, P.~K.}, Booktitle = {Proceedings of the $9^{th}$ International Conference on Parallel and Distributed Computing Systems ({PDCS})}, Date-Modified = {2006-08-01 14:00:43 +0900}, Keywords = {database replication, recovery}, Language = {English}, Pages = {684--689}, Publisher = {{IEEE} Computer}, Title = {A Fast and Robust Failure Recovery Scheme for Shared-Nothing Gigabit Networked Databases}, Url = {ftp://ftp.cs.pitt.edu/panos/PMDBnet/pdcs_96.ps.gz}, Year = {1996}} @inproceedings{BC97, Address = {New Orleans, {USA}}, Author = {Banerjee, S. and Chrysanthis, P.~K.}, Booktitle = {Proceedings of the $10^{th}$ International Conference on Parallel and Distributed Computing Systems ({PDCS})}, Date-Modified = {2006-03-08 19:22:32 +0900}, Keywords = {atomic commitment, 2pc, group communication}, Language = {English}, Organization = {{IEEE} Computer}, Pages = {428--432}, Title = {Performance Evaluation of the Group Two-Phase Locking Protocol}, Url = {ftp://violet.tele.pitt.edu/pub/Telecom_Faculty/Banerjee/Papers/PDCS-97.ps}, Year = {1997}} @techreport{BCBT96, Address = {Ithaca NY 14853 {USA}}, Author = {Basu, A. and Charron-Bost, B. 
and Toueg, S.}, Date-Modified = {2006-03-08 19:24:14 +0900}, Institution = {Cornell University, Computer Science Department}, Keywords = {group communication}, Language = {English}, Number = {TR96-1609}, Title = {Solving Problems in the presence of process crashes and lossy links}, Url = {ftp://ftp.cs.cornell.edu/pub/sam/crash.link.failures.ps}, Year = {1996}} @inproceedings{BCH+00, Address = {Hilton Head, South Carolina {USA}}, Author = {Birman, K. and Constable, R. and Hayden, M. and Kreitz, C. and Rodeh, O. and van~Renesse, R. and Vogels, W.}, Booktitle = {Proceedings of the {DARPA} Information Survivability Conference \& Exposition (DISCEX '00)}, Date-Modified = {2006-03-08 19:26:38 +0900}, Keywords = {group communication, toolkit}, Language = {English}, Title = {The {Horus} and {Ensemble} Projects: Accomplishments and Limitations}, Url = {http://www.cs.cornell.edu/Info/Projects/Spinglass/public_pdfs/Horus%20and%20Ensemble.pdf}, Year = {2000}} @inproceedings{BCH+98, Address = {Santa Fe, New Mexico, USA}, Affiliation = {New York University and Lucent Technologies, Bell Laboratories}, Author = {Baratloo, A. and Chung, P.~E. and Huang, Y.~H. and Rangarajan, S. and Yajnik, S.}, Booktitle = {Proceedings of the $4^{th}$ Conference on Object Oriented Technologies and Systems ({COOTS})}, Date-Modified = {2006-03-08 19:23:18 +0900}, Keywords = {replication, java}, Language = {English}, Organization = {USENIX}, Pages = {59--63}, Title = {Filterfresh: Hot Replication of {Java} {RMI} Server Objects}, Url = {http://www.usenix.org/publications/library/proceedings/coots98/baratloo.html}, Year = {1998}} @article{BFG01, Author = {Boichat, R. and Fr{\o}lund, S. and Guerraoui, R.}, Date-Modified = {2006-03-08 19:01:41 +0900}, Journal = {Concurrency and Computation: Practice and Experience}, Keywords = {consensus}, Language = {English}, Title = {Open Consensus}, Year = {2001}} @inproceedings{BG00, Author = {Boichat, R. and Guerraoui, R.}, Booktitle = {Proc. of 19$^{th}$ Symp. 
on Reliable Distr. Syst. (SRDS)}, Date-Modified = {2006-09-15 15:13:04 +0900}, Isbn = {0-7695-0543-0}, Issn = {1060-9857}, Keywords = {group communication, broadcast, reliable broadcast, crash recovery}, Language = {English}, Location = {N{\"{u}}rnberg, Germany}, Organization = {IEEE}, Pages = {32--41}, Title = {Reliable Broadcast in the Crash Recovery Model}, Year = {2000}, Abstract = {This paper addresses the problem of broadcasting messages in a reliable manner, within a practical asynchronous system where processes and channels may crash and recover. In this crash-recovery model, we present meaningful specifications of reliable broadcast and we describe algorithms that implement those specifications. Our approach is modular and incremental. It is modular in the sense that we give the properties of reliable broadcast separately and then consider their composition. It is incremental in the sense that we show how to automatically transform any reliable broadcast algorithm that implements a given specification into one that implements a stronger specification. In particular, we show how to reuse, in a crash-recovery model, reliable broadcast algorithms that were initially designed in a simpler crash-stop model.}} @inproceedings{BG92, Address = {San Diego, {CA} {USA}}, Author = {Van den Bussche, J. and Van Gucht, D.}, Booktitle = {Proceedings of the eleventh {ACM} symposium on Principles of database systems}, Date-Modified = {2006-03-08 19:27:48 +0900}, Keywords = {determinism}, Language = {English}, Organization = {ACM SIGACT-SIGMOD-SIGART}, Pages = {191--201}, Title = {Semi-determinism (extended abstract)}, Url = {http://www.acm.org/pubs/articles/proceedings/pods/137097/p191-van_den_bussche/p191-van_den_bussche.pdf}, Year = {1992}} @unpublished{BG92b, Author = {Van den Bussche, J. and
Van Gucht, D.}, Date-Modified = {2006-03-08 19:33:27 +0900}, Keywords = {determinism}, Language = {English}, Note = {Extended and revised version of~\cite{BG92}}, Title = {{A} semi-deterministic approach to object creation and non-determinism in database queries}, Url = {ftp://wins.uia.ac.be/pub/good/semidet.ps.Z}} @inproceedings{BG93, Address = {Newport Beach, {CA}}, Author = {Becker, T. and Grieger, K.}, Booktitle = {Proceedings of the 7th International Parallel Processing Symposium}, Date-Modified = {2006-03-08 19:23:56 +0900}, Editor = {Prasanna, V. K.}, Isbn = {0-8186-3442-1}, Keywords = {atomic broadcast}, Language = {English}, Pages = {816--823}, Publisher = {{IEEE} Computer Society Press}, Title = {An efficient atomic multicast protocol for client-server models}, Year = {1993}} @inproceedings{BGHJ92, Author = {Bhide, A. and Goyal, A. and Hsiao, H. and Jhingran, A.}, Booktitle = {Proceedings of 1992 SIGMOD International Conference on Management of Data}, Pages = {236--245}, Title = {An Efficient Scheme for Providing High Availability}, Year = {1992}} @article{BGMS92, Author = {Breitbart, Y. and Garcia-Molina, H. and Silberschatz, A.}, Date-Modified = {2006-05-17 16:34:04 +0900}, Editor = {Schek, H. J.}, Journal = {The {VLDB} Journal}, Keywords = {Multidatabase, serializability, recovery, reliability, transaction processing}, Language = {English}, Number = {2}, Pages = {181--239}, Publisher = {{VLDB} Endowment}, Title = {Overview of Multidatabase Transaction Management}, Volume = {1}, Year = {1992}} @inproceedings{BGRS00, Abstract-Url = {http://link.springer.de/link/service/series/0558/bibs/1900/19000435.htm}, Address = {Munich, Germany}, Affiliation = {Database Research Group, Institute of Information Systems, ETH Zentrum, CH-8092 Z{\"{u}}rich Switzerland}, Author = {B{\"o}hm, K. and Grabs, T. and R{\"o}hm, U. and Schek, H.-J.}, Booktitle = {Proceedings of the $6^{th}$ International Euro-Par Conference}, Date-Modified = {2006-03-08 19:20:01 +0900}, Editor = {Bode, A. 
and Ludwig~II, T. and Karl, W. and Wism{\"{u}}ller, R.}, Isbn = {3-540-67956-1}, Keywords = {database, replication}, Language = {English}, Pages = {435--440}, Publisher = {Springer Verlag}, Series = {Lecture Notes in Computer Science}, Title = {Evaluating the Coordination Overhead of Replica Maintenance in a Cluster of Databases}, Url = {http://link.springer.de/link/service/series/0558/papers/1900/19000435.pdf}, Volume = {1900}, Year = {2000}} @book{BHG87, Author = {Bernstein, P. and Hadzilacos, V. and Goodman, N.}, Date-Modified = {2006-03-08 19:25:24 +0900}, Isbn = {0-201-10715-5}, Keywords = {concurency control}, Language = {English}, Publisher = {Addison-Wesley}, Title = {Concurrency Control and Recovery in Database Systems}, Url = {http://research.microsoft.com/pubs/ccontrol/}, Year = {1987}} @inproceedings{BHJL86, Address = {Portland, OR USA}, Author = {Black, A. and Hutchinson, N. and Jul, E. and Levy, H.}, Booktitle = {Conference proceedings on Object-oriented programming systems, languages and applications}, Language = {English}, Pages = {78--86}, Title = {Object structure in the {Emerald} system}, Url = {http://www.acm.org/pubs/articles/proceedings/oops/28697/p78-black/p78-black.pdf}, Year = {1986}} @inproceedings{BJ87, Address = {Austin, {TX}, {USA}}, Author = {Birman, K. P. and Joseph, T. A.}, Booktitle = {Proceedings of the $11^{th}$ {ACM} Symposium on {OS} Principles}, Date-Modified = {2006-05-17 16:48:05 +0900}, Keywords = {distributed systems, toolkit, group communication}, Language = {English}, Organization = {{ACM} {SIGOPS}}, Pages = {123--138}, Publisher = {{ACM}}, Title = {Exploiting virtual synchrony in distributed systems}, Year = {1987}} @inproceedings{BK97, Address = {Tucson, {AZ} {USA}}, Author = {Breitbart, Y. and Korth, H. 
F.}, Booktitle = {Proceedings of the sixteenth {ACM} {SIGACT-SIGMOD-SIGART} symposium on Principles of database systems}, Date-Modified = {2006-03-08 15:18:37 +0900}, Keywords = {performance, verification}, Language = {English}, Organization = {ACM SIGACT-SIGMOD-SIGART}, Pages = {173--184}, Title = {Replication and consistency: being lazy helps sometimes}, Url = {http://www.acm.org/pubs/articles/proceedings/pods/263661/p173-breitbart/p173-breitbart.pdf}, Year = {1997}} @techreport{BKB00, Address = {Mura Anteo Zamboni 7, 40127 Bologna (Italy)}, Author = {Bartoli, A. and Kemme, B. and Babao{\u{g}}lu, {\"O}.}, Date-Modified = {2006-03-08 19:23:36 +0900}, Institution = {Department of Computer Science, University of Bologna}, Keywords = {database, recovery, replication}, Language = {English}, Number = {UBLCS-2000-17}, Title = {Online Reconfiguration in Replicated Databases Based on Group Communications}, Url = {ftp://ftp.cs.unibo.it/pub/techreports/2000-17.ps.gz}, Year = {2000}} @article{BKT92, Author = {Bal, H.~E. and Kaashoek, M.~F. and Tanenbaum, A.~S.}, Date-Modified = {2006-03-08 19:14:45 +0900}, Journal = {IEEE Transactions on Software Engineering}, Keywords = {language, distributed computing}, Language = {English}, Number = {3}, Pages = {190--205}, Title = {{Orca}: a language for parallel programming of distributed systems}, Url = {http://citeseer.nj.nec.com/bal92orca.html}, Volume = {18}, Year = {1992}} @article{BLW93, Author = {Banerjee, S. and Li, V.~O.~K.
and Wang, C.}, Date-Modified = {2006-03-08 19:02:40 +0900}, Journal = {{IEEE} Journal on Selected Areas in Communications}, Keywords = {database, distributed computing}, Language = {English}, Note = {Special Issue on Gigabit Network Protocols and Applications}, Number = {4}, Pages = {617--630}, Title = {Distributed Database Systems in High Speed Wide-Area Networks}, Url = {ftp://violet.tele.pitt.edu/pub/Telecom_Faculty/Banerjee/Papers/JSAC93.ps}, Volume = {11}, Year = {1993}} @article{BMD93, Author = {Barborak, M. and Malek, M. and Dahbura, A.}, Date-Modified = {2006-05-17 16:48:06 +0900}, Journal = {{ACM} Computing Surveys}, Keywords = {consensus, byzantine agreement, distributed systems, fault-tolerance}, Language = {English}, Number = {2}, Pages = {171--220}, Title = {The Consensus Problem in Fault-Tolerant Computing}, Url = {http://www.acm.org/pubs/articles/journals/surveys/1993-25-2/p171-barborak/p171-barborak.pdf}, Volume = {25}, Year = {1993}} @inbook{BMST93, Author = {Budhiraja, N. and Marzullo, K. and Schneider, F.~B. and Toueg, S.}, Chapter = {8 -- The primary-backup approach}, Date-Modified = {2006-03-08 19:28:21 +0900}, Edition = {second}, Editor = {Mullender, S.}, Isbn = {0-201-62427-3}, Keywords = {database, replication, distributed computing}, Language = {English}, Pages = {199--216}, Publisher = {Addison-Wesley}, Series = {ACM Press}, Title = {Distributed Systems}, Year = {1993}} @article{BN01, Affiliation = {Arizona State University, Tempe, Arizona and Intel Corporation, Hillsboro, Oregon}, Author = {Bazzi, R.~A. and Neiger, G.}, Date-Modified = {2006-03-08 19:06:36 +0900}, Issn = {0004-5411}, Journal = {Journal of the ACM}, Keywords = {distributed computing}, Language = {English}, Number = {3}, Pages = {499--554}, Title = {Simplifying Fault-Tolerance: providing the abstraction of crash failures}, Volume = {48}, Year = {2001}} @article{BR97, Author = {Borbst, S.
and Robertson, O.}, Date-Modified = {2006-10-24 13:41:06 +0900}, Journal = {DBMS}, Keywords = {Database, large-scale}, Language = {English}, Number = {2}, Title = {Taming Data Giants}, Url = {http://www.dbmsmag.com/9702d13.html}, Volume = {10}, Year = {1997}} @article{BSR80, Author = {Bernstein, P. A. and Shipman, D. W. and Rothnie, J. B.}, Date-Modified = {2006-03-08 19:02:58 +0900}, Journal = {{ACM} Transactions on Database Systems}, Keywords = {database, concurency control, distributed computing}, Language = {English}, Number = {1}, Pages = {18--51}, Title = {Concurrency Control in a System for Distributed Databases ({SDD}-1)}, Volume = {5}, Year = {1980}} @article{BSS91, Author = {Birman, K.~P. and Schiper, A. and Stephenson, P.}, Date-Modified = {2006-03-08 19:03:19 +0900}, Journal = {ACM Transactions on Computer Systems (TOCS)}, Keywords = {broadcast, causal order}, Language = {English}, Number = {3}, Pages = {272--314}, Title = {Light\-weight Causal and Atomic Group Multicast}, Url = {http://www.acm.org/pubs/articles/journals/tocs/1991-9-3/p272-schiper/p272-schiper.pdf}, Volume = {9}, Year = {1991}} @inproceedings{BST90, Address = {Atlantic City, NJ, USA}, Author = {Breitbart, Y. and Silberschatz, A. and Thompson, G.~R.}, Booktitle = {Proceedings of {ACM}-{SIGMOD} 1990 International Conference on Management of Data}, Date-Modified = {2006-10-17 09:56:27 +0900}, Issn = {0163-5808}, Keywords = {federated database}, Language = {English}, Organization = {ACM Special Interest Group on Management of Data}, Pages = {215--224}, Publisher = {Fort Collins Computer Center}, Title = {Reliable transaction management in a multidatabase system}, Volume = {19}, Year = {1990}} @techreport{BT93, Address = {5 Piazza di Porta S. Donato, 40127 Bologna (Italy)}, Author = {Babao{\u{g}}lu, {\"O}.
and Toueg, S.}, Date-Modified = {2006-03-08 19:19:17 +0900}, Institution = {Laboratory for Computer Science, University of Bologna}, Keywords = {atomic commitment}, Language = {English}, Number = {UBLCS-93-2}, Title = {Understanding Non-Blocking Atomic Commitment}, Url = {ftp://ftp.cs.unibo.it/pub/TR/UBLCS/atomic-commitment.ps.gz}, Year = {1993}} @inproceedings{BWK85, Address = {Stockholm}, Author = {Bancilhon, F. and Kim, W. and Korth, H.}, Booktitle = {Proceedings of the $11^{th}$ VLDB Conference}, Date-Modified = {2006-03-08 19:16:54 +0900}, Keywords = {transaction processing}, Language = {English}, Title = {A Model of {CAD} Transactions}, Year = {1985}} @inproceedings{Becker94, Address = {Pittsburgh, {PA}}, Author = {Becker, T.}, Booktitle = {Proceedings of the $2^{nd}$ International Workshop on Configurable Distributed Systems}, Date-Modified = {2006-03-08 19:25:11 +0900}, Keywords = {distributed computing}, Language = {English}, Pages = {36--45}, Title = {Application-Transparent Fault Tolerance in Distributed Systems}, Year = {1994}} @inproceedings{Ben83, Address = {Montr{\'e}al, Quebec, Canada}, Author = {Ben-Or, M.}, Booktitle = {Proceedings of the Second Annual Symposium on Principles of Distributed Computing}, Date-Modified = {2006-03-08 19:26:09 +0900}, Isbn = {0-89791-110-5}, Keywords = {time model}, Language = {English}, Organization = {ACM}, Pages = {27--30}, Title = {Another Advantage of free choice: Completely asynchronous agreement protocols}, Url = {http://portal.acm.org/citation.cfm?doid=800221.806707}, Year = {1983}, Abstract = {Recently, Fischer, Lynch and Paterson [3] proved that no completely asynchronous consensus protocol can tolerate even a single unannounced process death. We exhibit here a probabilistic solution for this problem, which guarantees that as long as a majority of the processes continues to operate, a decision will be made (Theorem 1).
Our solution is completely asynchronous and is rather strong: As in [4], it is guaranteed to work with probability 1 even against an adversary scheduler who knows all about the system.}} @article{Berenson95, Author = {Berenson, H. and Bernstein, P. and Gray, J. and Melton, J. and O'Neil, E. and O'Neil, P.}, Date-Modified = {2006-03-08 19:02:01 +0900}, Journal = {{SIGMOD} Record ({ACM} Special Interest Group on Management of Data)}, Keywords = {consistency}, Language = {English}, Number = {2}, Pages = {1--10}, Title = {A critique of {ANSI} {SQL} isolation levels}, Volume = {24}, Year = {1995}} @inproceedings{Bernstein97, Address = {Ulm, Germany}, Author = {Bernstein, P.}, Booktitle = {Datenbanksysteme in {B}{\"{u}}ro, {T}echnik, und {W}issenschaft 97 Conference}, Editor = {Dittrich, K. R. and Geppert, A.}, Isbn = {3-540-62569-0}, Language = {English}, Organization = {Gesellschaft f{\"u}r Informatik - {GI} (German Computer Society)}, Pages = {34--36}, Publisher = {Informatik Aktuell, Springer}, Title = {Repositories and Object Oriented Databases}, Year = {1997}} @inproceedings{Bernstein97b, Address = {Asilomar, {CA}.}, Author = {Bernstein, P.}, Booktitle = {Proceedings of the Seventh International Workshop on High Performance Transaction Systems}, Date-Modified = {2006-03-08 19:25:55 +0900}, Keywords = {transaction processing, consistency}, Language = {English}, Pages = {13--16}, Title = {Transactions and Serializability}, Year = {1997}} @article{Bhargava99, Author = {Bhargava, B.}, Date-Modified = {2006-03-08 19:02:09 +0900}, Journal = {{IEEE} Transactions on Knowledge and Data Engineering}, Keywords = {concurency control}, Language = {English}, Number = {1}, Pages = {3--16}, Title = {Concurrency Control in Database Systems}, Volume = {11}, Year = {1999}} @article{Birman00, Author = {Birman, K.
P.}, Date-Modified = {2006-05-17 16:34:37 +0900}, Journal = {Computer}, Keywords = {security, fault-tolerance, view synchrony}, Language = {English}, Number = {8}, Pages = {54--60}, Title = {The Next Generation Internet: Unsafe at Any Speed}, Volume = {33}, Year = {2000}} @techreport{Birman91b, Author = {Birman, K.~P.}, Date-Modified = {2006-03-08 19:27:19 +0900}, Institution = {Cornell University, Computer Science Department}, Keywords = {database, distributed computing, consistency}, Language = {English}, Number = {TR91-1240}, Title = {Maintaining Consistency in Distributed Systems}, Type = {Technical Report}, Year = {1991}} @article{Birman93, Author = {Birman, K.~P.}, Date-Modified = {2006-03-08 19:02:28 +0900}, Journal = {Communications of the ACM}, Keywords = {broadcast, distributed computing}, Language = {English}, Number = {12}, Pages = {37--53}, Title = {The process group approach to reliable distributed computing}, Url = {http://www.acm.org/pubs/toc/Abstracts/0001-0782/163303.html}, Volume = {36}, Year = {1993}} @book{Birman94, Author = {Birman, K.~P. 
and {van Renesse}, R.}, Date-Modified = {2006-03-08 19:26:49 +0900}, Keywords = {group communication, toolkit}, Language = {English}, Publisher = {{IEEE} Press}, Title = {Reliable Distributed Computing with the {I}{SIS} Toolkit}, Year = {1994}} @phdthesis{Boichat01, Address = {Switzerland}, Author = {Boichat, R.}, Date-Modified = {2006-03-08 19:25:41 +0900}, Keywords = {reliable broadcast, atomic broadcast}, Language = {English}, Number = {2472}, School = {\'{E}cole Polytechnique F\'{e}d\'{e}rale de Lausanne}, Title = {Reliable and Total Order Broadcast in the Crash Recovery Model}, Year = {2001}} @inproceedings{Borr84, Address = {Singapore}, Author = {Borr, A.}, Booktitle = {Proceedings of $10^{th}$ VLDB Conference}, Date-Modified = {2006-03-08 19:27:39 +0900}, Keywords = {database, fault-tolerance}, Language = {English}, Title = {Robustness to Crash in a Distributed Database: {A} Non Shared-Memory Multi-Processor Approach}, Year = {1984}} @article{Breton98, Author = {Breton, R.}, Author-Address = {Sybase Replication Server Engineering}, Date-Modified = {2006-03-08 19:34:20 +0900}, Journal = {Bulletin of the Technical Committee on Data Engineering}, Keywords = {replication}, Language = {English}, Number = {4}, Pages = {38--43}, Title = {Replication Strategies for High Availability and Disaster Recovery}, Volume = {21}, Year = {1998}} @inproceedings{CASD85, Address = {Ann Arbor, Michigan}, Author = {Cristian, F. and Aghili, H. and Strong, R. and Dolev, D.}, Booktitle = {Proceedings of the $15^{th}$ International Symposium on Fault-Tolerant Computing}, Date-Modified = {2006-03-08 19:03:29 +0900}, Keywords = {atomic broadcast}, Language = {English}, Pages = {1--12}, Title = {Atomic Broadcast: From Simple Message Diffusion to Byzantine Agreement}, Url = {http://citeseer.nj.nec.com/cristian85atomic.html}, Year = {1985}} @book{CDK00, Author = {Coulouris, G. and Dollimore, J.
and Kindberg, T.}, Date-Modified = {2006-03-08 19:32:59 +0900}, Edition = {Third}, Isbn = {0-201-61918-0}, Keywords = {distributed computing}, Language = {English}, Publisher = {Addison-Wesley}, Title = {Distributed Systems: Concepts and Design}, Url = {http://www.cdk3.net/}, Year = {2000}} @inproceedings{CDS01, Address = {Rome, Italy}, Author = {Charron-Bost, B. and D{\'e}fago, X. and Schiper, A.}, Booktitle = {Proceedings of the $6^{th}$ International Workshop on Object-oriented Real-time Dependable Systems (WORDS'01)}, Date-Modified = {2006-05-17 17:02:31 +0900}, Isbn = {0-7695-1068-X}, Keywords = {fault-tolerance, synchrony}, Language = {English}, Organization = {IEEE}, Pages = {21}, Publisher = {IEEE Computer Society}, Title = {Time vs. Space in Fault-Tolerant Distributed Systems}, Url = {http://doi.ieeecomputersociety.org/10.1109/WORDS.2001.945109}, Year = {2001}, Abstract = {Algorithms for solving agreement problems can be classified in two categories: (1) those relying on failure detectors (FDs), which we call FD-based, and (2) those that rely on a group membership service (GMS), which we call GMS-based. This paper discusses the advantages and limitations of these two approaches and proposes an extension to the GMS approach that combines the advantages of both approaches, without their drawbacks. This extension leads us to distinguish between time-triggered suspicions of processes and space-triggered exclusions}} @article{CF99, Author = {Cristian, F. and Fetzer, C.}, Date-Modified = {2006-03-08 19:14:57 +0900}, Journal = {{IEEE} Transactions on Parallel and Distributed Systems}, Keywords = {time model}, Language = {English}, Number = {6}, Title = {The Timed Asynchronous Distributed System Model}, Url = {http://www-cse.ucsd.edu/users/cfetzer/MODEL/}, Volume = {10}, Year = {1999}} @techreport{CHKS92, Author = {Ceri, S. and Houtsma, M.~A.~W. and Keller, A.~M. 
and Samarati, P.}, Institution = {Stanford University, Department of Computer Science, USA}, Language = {English}, Number = {CS-TR-92-1446}, Pages = {15}, Title = {Independent updates and incremental agreement in replicated databases}, Year = {1992}} @inproceedings{CHKS92b, Address = {Lorne, Victoria, Australia}, Author = {Ceri, S. and Houtsma, M. and Keller, A. and Samarati, P.}, Booktitle = {{IFIP} Semantics of Interoperable Database System Management Science,}, Editor = {of {TIMS}, Journal}, Language = {English}, Title = {Achieving Incremental Consistency among Autonomous Replicated Databases}, Year = {1992}} @techreport{CHKS94, Author = {Ceri, S. and Houtsma, M. and Keller, A. and Samarati, P.}, Institution = {Stanford University, Computer Science Department}, Keywords = {Database replication, Classification}, Language = {English}, Number = {CS-TR-91-1392}, Title = {A Classification of Update Methods for Replicated Databases}, Url = {http://www-db.stanford.edu/pub/keller/1991/fauve-survey.ps}, Year = {1994}} @article{CHT96, Author = {Chandra, T.~D. and Hadzilacos, V. and Toueg, S.}, Date-Modified = {2006-03-08 19:03:38 +0900}, Journal = {Journal of the ACM}, Keywords = {consensus, failure detection}, Language = {English}, Number = {4}, Pages = {685--722}, Title = {The weakest failure detector for solving consensus}, Url = {http://portal.acm.org/citation.cfm?doid=234533.234549}, Volume = {43}, Year = {1996}} @inbook{CJ97, Author = {Chow, R. and Johnson, T.}, Chapter = {12 - Replicated Data Management}, Date-Modified = {2006-03-08 19:34:09 +0900}, Institution = {University of Florida}, Isbn = {0-201-49838-3}, Keywords = {replication, database}, Language = {English}, Publisher = {Addison-Wesley}, Title = {Distributed Systems \& Algorithms}, Year = {1997}} @inproceedings{CK00, Abstract-Url = {http://www.cs.umd.edu/~keleher/abstracts/srds00.html}, Address = {N{\"{u}}rnberg, Germany}, Author = {{\c{C}}etintemel, U.
and Keleher, P.}, Booktitle = {Proceedings of the $19^{th}$ Symposium on Reliable Distributed Systems {SRDS'2000}}, Isbn = {0-7695-0543-0}, Issn = {1060-9857}, Keywords = {DENO, lazy replication}, Language = {English}, Organization = {{IEEE} Computer Society}, Pages = {218--227}, Publisher = {{IEEE} Computer Society, Los Alamitos, California}, Title = {Performance of Mobile, Single Object, Replication Protocols}, Url = {http://www.cs.umd.edu/~ugur/Papers/srds00.pdf}, Year = {2000}, Annote = {Epidemic protocol for Generic Broadcast}} @techreport{CL88, Address = {Madison, WI 53706 {USA}}, Author = {Carey, M.~J. and Livny, M.}, Institution = {Computer Science Department, University of Wisconsin}, Keywords = {database replication, simulation}, Language = {English}, Number = {758}, Title = {Distributed Concurrency Control Performance: {A} Study of Algorithms, Distribution and Replication}, Year = {1988}} @article{CMA97, Author = {Cristian, F. and Mishra, S. and Alvarez, G.}, Date-Modified = {2005-11-02 15:28:09 +0900}, Institution = {{IEEE} and British Computer Society}, Journal = {Distributed System Engineering Journal}, Keywords = {group communication}, Language = {English}, Number = {2}, Pages = {109--128}, Title = {High-Performance Asynchronous Atomic Broadcast}, Volume = {4}, Year = {1997}} @techreport{CP92, Address = {New York, {NY} 10027}, Author = {Chen, S.~W. and Pu, C.}, Institution = {Columbia University, Department of Computer Science}, Keywords = {Database Replication, Classification}, Language = {English}, Number = {CUCS-006-92}, Title = {A Structural Classification of Integrated Replica Control Mechanisms}, Url = {http://citeseer.nj.nec.com/chen92structural.html}, Year = {1992}} @article{CR94, Affiliation = {Pittsburgh Univ., PA, USA}, Author = {Chrysanthis, P. K.
and Ramamritham, K.}, Date-Modified = {2006-06-26 17:07:31 +0900}, Issn = {0362-5915}, Journal = {{ACM} Transactions on Database Systems}, Keywords = {transaction processing}, Number = {3}, Pages = {450--491}, Title = {Synthesis of Extended Transaction Models Using {ACTA}}, Volume = {19}, Year = {1994}} @inproceedings{CS93, Alternate-Key = {Cheriton93}, Author = {Cheriton, D.~R. and Skeen, D.}, Booktitle = {Proc. of the 14$^{th}$ Symp. on Operating Systems Principles}, Date-Modified = {2006-10-16 18:25:42 +0900}, Editor = {Liskov, Barbara}, Isbn = {0-89791-632-8}, Keywords = {group communication, Causal Order}, Language = {English}, Location = {Asheville, North Carolina, United States}, Pages = {44--57}, Publisher = {ACM}, Title = {Understanding the Limitations of Causally and Totally Ordered Communication}, Url = {http://doi.acm.org/10.1145/168619.168623}, Year = {1993}, Abstract = {Causally and totally ordered communication support (CATOCS) has been proposed as important to provide as part of the basic building blocks for constructing reliable distributed systems. In this paper, we identify four major limitations to CATOCS, investigate the applicability of CATOCS to several classes of distributed applications in light of these limitations, and the potential impact of these facilities on communication scalability and robustness. From this investigation, we find limited merit and several potential problems in using CATOCS. The fundamental difficulty with the CATOCS is that it attempts to solve problems at the communication level in violation of the well-known end-to-end argument.}} @inbook{CSA98, Author = {Chrysanthis, P.~K. and Samaras, G. and Al-Houmaily, Y.~J.}, Chapter = {Recovery and Performance of Atomic Commit Processing in Distributed Database Systems}, Date-Modified = {2006-04-11 10:36:35 +0900}, Edition = {First}, Isbn = {0-13-614215-X}, Keywords = {atomic commitment}, Language = {English}, Opteditor = {V. Kumar and M.
Hsu}, Publisher = {Prentice-Hall}, Title = {Recovery Mechanisms in Database Systems}, Url = {http://citeseer.nj.nec.com/chrysanthis98recovery.html}, Year = {1998}} @manual{CSIM18, Address = {3925 West Braker Lane Austin Texas TX-78759-5321, USA}, Keywords = {discrete simulation engine}, Language = {English}, Organization = {Mesquite Software Inc.}, Organization-Url = {mailto:info@mesquite.com}, Title = {{CSIM18} simulation engine ({C}++ version)}, Year = {1994}} @techreport{CT92, Abstract-Url = {ftp://ftp.cs.umass.edu/pub/techrept/ABSTRACT/UM-CS-1992-067.ABS}, Address = {Amherst, MA 01003 {USA}}, Author = {Chen, S. and Towsley, D.}, Date-Modified = {2006-03-08 19:33:50 +0900}, Institution = {Department of Computer Science, University of Massachusetts}, Keywords = {replication, hardware}, Language = {English}, Number = {UM-CS-1992-067}, Title = {{A} Performance Evaluation of {RAID} Architectures}, Url = {ftp://ftp.cs.umass.edu/pub/techrept/techreport/1992/UM-CS-1992-067.ps}, Year = {1992}} @article{CT96, Author = {Chandra, T.~D. and Toueg, S.}, Date-Modified = {2006-09-15 15:14:23 +0900}, Issn = {0004-5411}, Journal = {Journal of the ACM}, Keywords = {consensus, failure detection}, Language = {English}, Number = {2}, Pages = {225--267}, Title = {Unreliable failure detectors for reliable distributed systems}, Url = {http://doi.acm.org/10.1145/226643.226647}, Volume = {43}, Year = {1996}, Abstract = {We introduce the concept of unreliable failure detectors and study how they can be used to solve Consensus in asynchronous systems with crash failures. We characterise unreliable failure detectors in terms of two properties---completeness and accuracy. We show that Consensus can be solved even with unreliable failure detectors that make an infinite number of mistakes, and determine which ones can be used to solve Consensus despite any number of crashes, and which ones require a majority of correct processes.
We prove that Consensus and Atomic Broadcast are reducible to each other in asynchronous systems with crash failures; thus, the above results also apply to Atomic Broadcast. A companion paper shows that one of the failure detectors introduced here is the weakest failure detector for solving Consensus [Chandra et al. 1992].}} @inproceedings{CW86, Address = {Kyoto}, Author = {Chou, H.-T. and Kim, W.}, Booktitle = {Proceedings of the 12th {VLDB} Conference}, Language = {English}, Title = {A Unifying Framework for Version Control in a {CAD} Environment}, Year = {1986}} @article{CZ85, Annote = {First description of group communications?}, Author = {Cheriton, D.~R. and Zwaenepoel, W.}, Date-Modified = {2006-03-08 19:06:17 +0900}, Journal = {ACM Transactions on Computer Systems (TOCS)}, Keywords = {distributed computing, group communication}, Language = {English}, Number = {2}, Pages = {77--107}, Title = {Distributed process groups in the {V} Kernel}, Url = {http://doi.acm.org/10.1145/214438.214439}, Volume = {3}, Year = {1985}} @techreport{Cap90, Address = {190, Winterthurstra{\ss}e CH-8057 Z{\"{u}}rich, Switzerland}, Author = {Cap, C.~H.}, Date-Modified = {2006-03-08 19:36:48 +0900}, Institution = {Department of Computer Science, University of Z{\"{u}}rich}, Keywords = {database, replication}, Language = {English}, Number = {ifi-90.11}, Title = {Distributed Systems with Data Replication: {A} Non-Technical Survey}, Url = {ftp://ftp.ifi.unizh.ch/pub/techreports/TR-90/ifi-90.11.ps.gz}, Year = {1990}} @article{Chandy83, Author = {Chandy, K. Mani and Haas, L.~M. and Misra, J.}, Date-Modified = {2006-03-08 15:25:02 +0900}, Journal = {ACM Transactions on Computer Systems (TOCS)}, Keywords = {Deadlock}, Language = {English}, Number = {2}, Pages = {144--156}, Title = {Distributed Deadlock Detection}, Volume = {1}, Year = {1983}, Annote = {Distributed deadlock models are presented for resource{\newline} and communication deadlocks.
Simple distributed{\newline} algorithms for detection of these deadlocks are given.{\newline} We show that all true deadlocks are detected and that{\newline} no false deadlocks are reported. In our algorithms, no{\newline} process maintains global information; all messages have{\newline} an identical short length. The algorithms can be{\newline} applied in distributed database and other message{\newline} communication systems.}} @inproceedings{Chang84, Address = {Boston, Massachusetts {USA}}, Affiliation = {AT\&T Bell Lab, Murray Hill, NJ 07974, {USA}}, Author = {Chang, J.~M.}, Booktitle = {{SIGMOD}'84, Proceedings of Annual Meeting}, Date-Modified = {2006-04-11 10:26:58 +0900}, Editor = {Yormark, Beatrice}, Isbn = {0-89791-128-8}, Issn = {0163-5808}, Keywords = {database replication, atomic broadcast}, Language = {English}, Month = {June}, Organization = {{ACM}, Special Interest Group on Management of Data, New York, {NY}, {USA}}, Pages = {223--233}, Title = {Simplifying Distributed Database Systems Design by Using a Broadcast Network}, Volume = {14}, Year = {1984}} @manual{DB2:2vs3tier, Address = {{IBM} Santa Teresa Laboratory, San Jose, California {USA}}, Author = {Munk, T.}, Date-Modified = {2006-08-01 14:12:08 +0900}, Keywords = {transaction processing, commercial}, Language = {English}, Organization = {{IBM}}, Title = {{DB2} Family: Client/Server Performance Measurement Series}, Url = {http://www.software.ibm.com/data/db2/performance/2vs3tier.pdf}, Year = {1998}, Abstract = {This technical report compares the performance{\newline} characteristics of various distributed programming{\newline} techniques.
These techniques are static vs dynamic vs{\newline} stored procedures.}} @manual{DB2:Replication, Address = {New Orchard Road, Armonk, NY 10504 ({USA})}, Date-Modified = {2006-04-11 10:39:49 +0900}, Keywords = {database}, Language = {English}, Note = {Number SC26-9642-00}, Number = {SC26-9642-00}, Organization = {{IBM}}, Title = {{DB2}: Replication Guide and Reference}, Url = {ftp://ftp.software.ibm.com/ps/products/db2/info/vr6/pdf/a4/db2e0e60.pdf}, Year = {1999}} @techreport{DB2:Version7, Author = {Bergman, R. and Tsounis, C.}, Date-Modified = {2006-04-11 10:37:39 +0900}, Institution = {{IBM} Corporation}, Keywords = {database}, Language = {English}, Title = {{DB2} Universal Data Base Version 7 Features and Facilities}, Type = {White Paper}, Url = {http://www-4.ibm.com/software/data/pubs/papers/#db2udbv7}, Year = {2000}} @article{DGS85, Author = {Davidson, S. B. and Garcia-Molina, H. and Skeen, D.}, Date-Modified = {2006-03-08 19:04:25 +0900}, Journal = {{ACM} Computing Surveys}, Keywords = {consistency, distributed computing}, Language = {English}, Number = {3}, Pages = {341--370}, Title = {Consistency in Partitioned Networks}, Url = {http://portal.acm.org/citation.cfm?id=5508&coll=ACM&dl=ACM&CFID=11016369&CFTOKEN=31866788}, Volume = {17}, Year = {1985}} @article{DLS88, Address = {New York, NY}, Author = {Dwork, C. and Lynch, N. and Stockmeyer, L.}, Date-Modified = {2006-03-08 19:03:52 +0900}, Journal = {Journal of the ACM}, Keywords = {consensus}, Language = {English}, Number = {2}, Pages = {288--323}, Publisher = {{ACM} Press}, Title = {Consensus in the Presence of Partial Synchrony}, Volume = {35}, Year = {1988}} @article{DM96, Alternate-Key = {Dolev96}, Author = {Dolev, D.
and Malki, D.}, Date-Modified = {2006-03-08 19:07:27 +0900}, Journal = {Communications of the {ACM}}, Keywords = {group communication, toolkit}, Language = {English}, Number = {4}, Pages = {64--70}, Title = {The {Transis} Approach to High Availability Cluster Communication}, Url = {http://www.acm.org/pubs/articles/journals/cacm/1996-39-4/p64-dolev/p64-dolev.pdf}, Volume = {39}, Year = {1996}} @manual{DRAGON, Date-Modified = {2006-08-01 14:03:02 +0900}, Key = {DRAGON}, Keywords = {Dragon, database replication}, Language = {English}, Note = {http://www.inf.ethz.ch/department\linebreak[1]/IS/iks/research/dragon.html}, Organization = {Information \& Communications Systems Research Group, {ETH} Z{\"u}rich and Laboratoire de Syst\`emes d'Exploitation ({LSE}), {EPF} Lausanne}, Title = {{DRAGON}: Database Replication Based on Group Communication}, Url = {http://www.inf.ethz.ch/department/IS/iks/research/dragon.html}, Year = {1998}} @inproceedings{DSS98, Abstract-Url = {http://lsewww.epfl.ch/Documents/abstract/DSS98.txt}, Address = {West Lafayette, IN, USA}, Author = {D{\'e}fago, X. and Schiper, A. and Sergent, N.}, Booktitle = {Proceedings of the $17^{th}$ Symposium on Reliable Distributed Systems ({SRDS})}, Date-Modified = {2006-05-17 16:16:10 +0900}, Isbn = {0-8186-9218-9}, Keywords = {Replication}, Language = {English}, Month = {October}, Organization = {IEEE}, Pages = {43--50}, Title = {Semi-Passive Replication}, Url = {http://dlib.computer.org/conferen/srds/9218/pdf/92180043.pdf}, Year = {1998}, Abstract = {This paper presents the semi-passive replication technique, a variant of passive replication, that can be implemented in the asynchronous system model without requiring a membership service to agree on a primary. Passive replication is a popular replication technique since it can tolerate non-deterministic servers (e.g., multi-threaded servers) and uses little processing power when compared to other replication techniques.
However, passive replication suffers from a high reconfiguration cost in case of the failure of the primary. The semi-passive replication technique presented in the paper benefits from the same advantages as passive replication. However, since it does not require a group membership service, semi-passive replication has a considerably lower cost in case of failure. As explained in the paper, this technique can benefit from an aggressive time-out value significantly lower than what a group membership allows. As a result, the reaction to crashes is greatly improved. The semi-passive replication algorithm uses failure detectors. The algorithm given in the paper is analysed in the failure free case and in the case of one server crash. The response time (for the client) of these two scenarios is analysed through simulation}} @inproceedings{DST87, Address = {San Francisco, CA}, Author = {Daniels, D. S. and Spector, A. Z. and Thompson, D. S.}, Booktitle = {Proceedings of the {ACM} {SIGMOD} Annual Conference}, Date-Modified = {2006-03-08 19:38:07 +0900}, Editor = {Dayal, U. and Traiger, I.}, Keywords = {transaction processing}, Language = {English}, Organization = {{ACM}}, Pages = {82--96}, Publisher = {{ACM} Press}, Title = {Distributed Logging for Transaction Processing}, Year = {1987}} @techreport{DSU00, Address = {Switzerland}, Author = {D{\'e}fago, X. and Schiper, A. and Urb{\'a}n, P.}, Date-Modified = {2006-05-17 16:50:22 +0900}, Institution = {\'{E}cole Polytechnique F\'{e}d\'{e}rale de Lausanne}, Keywords = {atomic broadcast}, Language = {English}, Number = {DSC/2000/036}, Title = {Totally Ordered Broadcast and Multicast Algorithms: {A} Comprehensive Survey}, Url = {http://ddg.jaist.ac.jp/pub/DSU00.pdf}, Year = {2000}, Abstract = {Total order multicast algorithms constitute an important class of problems in distributed systems, especially in the context of fault-tolerance. 
In short, the problem of total order multicast consists in sending messages to a set of processes, in such a way that all messages are delivered by all correct destinations in the same order. However, the huge amount of literature on the subject and the plethora of solutions proposed so far make it difficult for practitioners to select a solution adapted to their specific problem. As a result, naive solutions are often used while better solutions are ignored. This paper proposes a classification of total order multicast algorithms based on the ordering mechanism of the algorithms, and describes a set of common characteristics (e.g., assumptions, properties) with which to evaluate them. In this classification, more than fifty total order broadcast and multicast algorithms are surveyed. The presentation includes asynchronous algorithms as well as algorithms based on the more restrictive synchronous model. Fault-tolerance issues are also considered as the paper studies the properties and behavior of the different algorithms with respect to failures. }} @inproceedings{DWAP94, Address = {Monterey, California}, Author = {Dahlin, M. D. and Wang, R. Y. and Anderson, T. E. and Patterson, D.
A.}, Booktitle = {Proceedings of the First Symposium on Operating Systems Design and Implementation}, Language = {English}, Note = {Also appeared as University of California Technical Report CSD-94-844.}, Organization = {USENIX Association}, Pages = {267--280}, Title = {Cooperative Caching: Using Remote Client Memory to Improve File System Performance}, Url = {http://www.cs.princeton.edu/~rywang/berkeley/papers/osdi94.ps}, Year = {1994}} @phdthesis{Def00, Address = {Switzerland}, Author = {D{\'e}fago, X.}, Date-Modified = {2006-05-17 16:51:56 +0900}, Keywords = {replication, consensus, atomic broadcast}, Language = {English}, Number = {2229}, School = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne}, Title = {Agreement-Related Problems: From Semi-Passive Replication to Totally Ordered Broadcast}, Url = {http://infoscience.epfl.ch/search.py?recid=49980}, Year = {2000}} @unpublished{Defago97, Author = {D{\'e}fago, X.}, Date-Modified = {2006-03-08 19:32:21 +0900}, Keywords = {CORBA}, Language = {English}, Note = {Internal Report}, Title = {{CORBA} Performance Tool -- State Transfer}, Year = {1997}} @inproceedings{Demers87, Address = {Vancouver, BC, Canada}, Author = {Demers, A. and others}, Booktitle = {Proceedings of the $6^{th}$ Annual {ACM} Symposium on Principles of Distributed Computing}, Date-Modified = {2006-04-11 10:26:13 +0900}, Editor = {Schneider, Fred B.}, Isbn = {0-89871-239-4}, Keywords = {database replication, epidemic algorithm}, Language = {English}, Pages = {1--12}, Publisher = {ACM Press}, Title = {Epidemic Algorithms for Replicated Database Maintenance}, Year = {1987}} @techreport{EAWJ96, Address = {Pittsburgh, PA, {USA}}, Author = {Elnozahy, M. and Alvisi, L. and Wang, Y.~M. 
and Johnson, D.~B.}, Date-Modified = {2006-05-17 17:03:41 +0900}, Institution = {School of Computer Science, Carnegie Mellon University}, Keywords = {message passing, message logging}, Language = {English}, Number = {CMU-CS-96-181}, Title = {A survey of rollback-recovery protocols in message passing systems}, Url = {http://citeseer.nj.nec.com/elnozahy96survey.html}, Year = {1996}} @article{Eswaran76, Author = {Eswaran, K. P. and Gray, J. N. and Lorie, R. A. and Traiger, I. L.}, Date-Modified = {2005-11-02 15:17:29 +0900}, Issn = {0001-0782}, Journal = {Communications of the {ACM}}, Keywords = {2PC}, Language = {English}, Note = {Also published in/as: IBM Research Report RJ1487, San Jose, CA, December, 1974.}, Number = {11}, Pages = {624--633}, Title = {The notions of consistency and predicate locks in a database system}, Volume = {19}, Year = {1976}, Annote = {If transactions are well-formed (they obtain and{\newline} release all locks required to avoid conflicts) and{\newline} 2-phase (they don't obtain locks after having released{\newline} one), they maintain consistency. Original 2-phase{\newline} protocol. Serializability? Definitions of transactions,{\newline} permissible locking sequence in transactions, the{\newline} importance and complexity of predicate locks. Two-phase{\newline} transactions.}} @inproceedings{FC92, Address = {Edmonton (Canada)}, Author = {Franklin, M. J. and Carey, M. J.}, Booktitle = {Proceedings of the International Workshop on Distributed Object Management}, Language = {English}, Pages = {252--274}, Title = {Client-Server Caching Revisited}, Year = {1992}} @inproceedings{FC94, Address = {Santiago, Chile}, Author = {Fu, A. W. and Cheung, D. 
W.}, Booktitle = {Proceedings of the International Conference on Very Large Databases}, Date-Modified = {2006-08-01 13:59:18 +0900}, Keywords = {database replication, transaction processing}, Language = {English}, Title = {A Transaction Replication Scheme for a Replicated Database with Node Autonomy}, Year = {1994}} @inbook{FC98, Author = {Franklin, M.~J. and Carey, M.~J.}, Chapter = {Client-Server Caching Revisited}, Edition = {Third}, Editor = {Stonebraker and Hellerstein}, Language = {English}, Publisher = {Morgan Kaufmann}, Title = {Readings in Database Systems}, Url = {http://www.cs.umd.edu/users/franklin/papers/iwdom.ps.gz}, Year = {1998}} @techreport{FG99, Author = {Fr{\o}lund, S. and Guerraoui, R.}, Date-Modified = {2005-11-02 15:34:33 +0900}, Institution = {HP Labs}, Keywords = {3-tier}, Language = {English}, Number = {HPL-1999-105}, Title = {Exactly Once Transactions}, Url = {http://www.hpl.hp.com/techreports/1999/HPL-1999-105.html}, Year = {1999}, Abstract = {A three-tier application is organized as three layers:{\newline} human users interact with front-end clients (e.g.{\newline} browsers); middle-tier application servers (e.g. Web{\newline} servers) contain the application's business logic, and{\newline} perform transactions against back-end databases.{\newline} Although three-tier applications are becoming{\newline} mainstream, they usually fail to provide sufficient{\newline} reliability guarantees to the users. Usually,{\newline} replication and transaction-processing techniques are{\newline} applied to specific parts of the application, but their{\newline} combination does not provide end-to-end reliability.{\newline} The aim of this paper is to precisely define a{\newline} desirable, yet realistic, specification of end-to-end{\newline} reliability in three-tier applications. 
We present the{\newline} specification in the form of a problem, called{\newline} Exactly-Once Transactions, which encompasses both{\newline} safety and liveness properties in such environments. We{\newline} also describe a practical protocol that solves the{\newline} problem and we discuss its implementation and{\newline} performance in a practical setting.}} @article{FLP85, Author = {Fischer, M. J. and Lynch, N. A. and Paterson, M. S.}, Date-Modified = {2006-05-17 16:48:06 +0900}, Journal = {Journal of the {ACM}}, Keywords = {operating system commit processing, distributed systems}, Language = {English}, Number = {2}, Pages = {374--382}, Title = {Impossibility of Distributed Consensus with One Faulty Process}, Url = {http://www.acm.org/pubs/articles/journals/jacm/1985-32-2/p374-fischer/p374-fischer.pdf}, Volume = {32}, Year = {1985}} @inproceedings{FP01, Address = {New Orleans, LA, USA}, Author = {Fr{\o}lund, S. and Pedone, F.}, Booktitle = {Proceedings of the $20^{th}$ Symposium on Reliable Distributed Systems}, Date-Modified = {2006-04-11 10:28:01 +0900}, Isbn = {0-7695-1366-2}, Issn = {1060-9857}, Keywords = {database replication}, Language = {English}, Pages = {46--55}, Publisher = {IEEE Computer Society, Los Alamitos, California, USA}, Title = {Continental Pronto}, Year = {2001}} @techreport{FR95, Alternate-Key = {Friedman95}, Author = {Friedman, R. and {van Renesse}, R.}, Institution = {Cornell University, Computer Science Department}, Language = {English}, Number = {TR95-1527}, Pages = {18}, Title = {Packing Messages as a Tool for Boosting the Performance of Total Ordering Protocols}, Type = {Technical Report}, Year = {1995}} @article{FRT92, Author = {Franaszek, P. A. and Robinson, J. T. 
and Thomasian, A.}, Date-Modified = {2006-03-08 15:24:17 +0900}, Journal = {{ACM} Transactions on Database Systems}, Keywords = {database, concurency control, transaction processing, simulation}, Language = {English}, Number = {2}, Pages = {304--345}, Title = {Concurrency Control for High Contention Environments}, Volume = {17}, Year = {1992}} @phdthesis{Fel98, Address = {Switzerland}, Author = {Felber, P.}, Date-Modified = {2006-05-17 16:39:30 +0900}, Keywords = {CORBA, group communication, toolkit, service}, Language = {English}, Number = {1867}, School = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne}, Title = {The {CORBA} Object Group Service: {A} Service Approach to Object Groups in {CORBA}}, Url = {http://lsewww.epfl.ch/Documents/acrobat/Fel98.pdf}, Year = {1998}} @inproceedings{Fetzer99, Address = {Madeira Island, Portugal}, Author = {Fetzer, C.}, Booktitle = {Proceedings of the Third European Research Seminar on Advances in Distributed Systems (ERSADS'99)}, Date-Modified = {2006-04-11 10:27:40 +0900}, Keywords = {time model}, Language = {English}, Pages = {109--118}, Title = {A comparison of timed asynchronous systems and asynchronous systems with failure detectors}, Url = {http://www.research.att.com/~christof/TMR/}, Year = {1999}} @inproceedings{GA87, Author = {Gray, J. N. and Anderton, M.}, Booktitle = {Proceedings of the IEEE, Special Issue on Distributed Databases}, Date-Modified = {2006-08-01 14:22:25 +0900}, Keywords = {distributed systems, Fault statistics}, Language = {English}, Note = {Also Tandem Technical Report 85.5}, Number = {5}, Pages = {719--726}, Publisher = {IEEE}, Title = {Distributed Computer Systems: Four Case Studies}, Volume = {75}, Year = {1987}} @inproceedings{GAGGSVV99, Address = {Toulouse (France)}, Author = {Ghose, K. and Aggarwal, S. and Ghosh, A. and Goldman, D. and Sulatycke, P. and Vask, P. and Voguel, D. R.}, Booktitle = {Proceedings of {E}uro-{P}ar'99 - Parallel Processing}, Editor = {Amestoy, P. and Berger, P. 
and Dayd{\'e}, M. and Duff, I. and Frayss{\'e}, V. and Giraud, L. and Ruiz, D.}, Isbn = {3-540-66443-2}, Issn = {0302-9743}, Language = {English}, Organization = {{ACM} - {IFIR}}, Pages = {505--510}, Publisher = {Springer-Verlag}, Series = {Lecture Notes in Computer Science}, Title = {Designing Multiprocessor/Distributed Real-Time Systems Using the {ASSERT} Toolkit}, Volume = {1685}, Year = {1999}} @book{GBD+94, Author = {Geist, A. and Beguelin, A. and Dongarra, J. and Jiang, W. and Manchek, R. and Sunderam, V.}, Date-Modified = {2006-08-01 14:18:30 +0900}, Editor = {Kowalik, J.}, Keywords = {toolkit, message passing}, Language = {English}, Publisher = {{MIT} Press Scientific and Engineering Computation}, Title = {{PVM}: Parallel Virtual Machine {A} Users' Guide and Tutorial for Networked Parallel Computing}, Url = {http://www.netlib.org/pvm3/book/pvm-book.html}, Year = {1994}} @incollection{GHM+00, Author = {Guerraoui, R. and Hurfin, M. and Mostefaoui, A. and Oliveira, R. and Raynal, M. and Schiper, A.}, Booktitle = {Advances in Distributed Systems}, Date-Modified = {2006-09-15 15:19:15 +0900}, Isbn = {3-540-67196}, Issn = {0302-9743}, Keywords = {consensus}, Language = {English}, Pages = {33--47}, Publisher = {Springer}, Series = {LNCS}, Title = {Consensus in Asynchronous Distributed Systems: {A} Concise Guided Tour}, Url = {http://link.springer.de/link/service/series/0558/tocs/t1752.htm}, Volume = {1752}, Year = {2000}} @inproceedings{GHOS96, Abstract-Url = {http://www.research.microsoft.com/scripts/pubdb/pubsasp.asp?RecordID=139}, Address = {Montreal, Canada}, Alternate-Key = {Gray96}, Author = {Gray, J.~N. and Helland, P. and O'Neil, P. 
and Shasha, D.}, Booktitle = {Proceedings of the 1996 International Conference on Management of Data}, Date-Modified = {2006-04-11 10:39:04 +0900}, Keywords = {database, Database replication}, Language = {English}, Pages = {173--182}, Publisher = {ACM-SIGMOD}, Title = {The Dangers of Replication and a Solution}, Url = {ftp://ftp.research.microsoft.com/pub/tr/tr-96-17.doc}, Year = {1996}} @inproceedings{GHR97, Address = {New York}, Author = {Gupta, R. and Haritsa, J. and Ramamritham, K.}, Booktitle = {Proceedings of the {ACM} {SIGMOD} International Conference on Management of Data}, Issn = {0163-5808}, Language = {English}, Number = {2}, Pages = {486--497}, Publisher = {ACM Press}, Series = {SIGMOD Record}, Title = {Revisiting Commit Processing in Distributed Database Systems}, Volume = {26}, Year = {1997}} @article{GJN99, Address = {RWTH Aachen, Informatik V, Ahornstr. 55 D-52056 Aachen, Germany}, Author = {Gallersd{\"{o}}rfer, R. and Jarke, M. and Nicola, M.}, Date-Modified = {2006-03-08 15:30:31 +0900}, Journal = {International Journal of Cooperative Information Systems (IJCS)}, Keywords = {database, transaction processing, lazy replication, performance}, Language = {English}, Number = {1}, Pages = {14--45}, Title = {The {ADR} Replication Manager}, Url = {http://www-i5.informatik.rwth-aachen.de/~nicola/IJCIS99.ps}, Volume = {8}, Year = {1999}} @inproceedings{GMP90, Address = {Brisbane, Australia}, Altkey = {Epoch90}, Author = {Garcia-Molina, H. and Polyzois, C. A.}, Booktitle = {Proceedings of $16^{th}$ VLDB Conference}, Language = {English}, Pages = {222--230}, Title = {Two Epoch Algorithms for Disaster Recovery}, Year = {1990}} @inproceedings{GN95, Author = {Gallersd{\"o}rfer, R. and Nicola, M.}, Booktitle = {VLDB'95, Proceedings of 21th International Conference on Very Large Data Bases, September 11-15, 1995, Z{\"{u}}rich, Switzerland}, Editor = {Dayal, Umeshwar and Gray, Peter M. D. 
and Nishio, Shojiro}, Isbn = {1-55860-379-4}, Language = {English}, Pages = {445--456}, Publisher = {Morgan Kaufmann}, Title = {Improving Performance in Replicated Databases through Relaxed Coherency}, Url = {http://www-i5.informatik.rwth-aachen.de/~nicola/VLDB95.ps}, Year = {1995}} @inproceedings{GNS+00, Address = {Antwerp, Belgium}, Author = {Gokhale, A. and Natarajan, B. and Schmidt, D.~C. and Yajnik, S.}, Booktitle = {Proceedings of the $2^{nd}$ International Symposium on Distributed Objects and Applications (DOA '00)}, Date-Modified = {2006-03-08 19:17:57 +0900}, Keywords = {group communication, CORBA, toolkit}, Language = {English}, Organization = {Object Management Group}, Title = {{DOORS}: Towards High-performance Fault-Tolerant {CORBA}}, Url = {http://www.cs.wustl.edu/~schmidt/PDF/DOA-2000.pdf}, Year = {2000}} @techreport{GOR94, Address = {Atlanta, Georgia, 30332-03280 USA}, Author = {Gukal, S. and Omiecinski, E. and Ramachandran, U.}, Institution = {Georgia Institute of Technology. College of Computing}, Language = {English}, Number = {GIT-CC-94-13}, Title = {Avoiding Conflicts between Reads and Writes Using Dynamic Versioning}, Url = {ftp://ftp.cc.gatech.edu/pub/coc/tech_reports/1994/GIT-CC-94-13.ps.Z}, Year = {1994}} @techreport{GOS98, Address = {Switzerland}, Author = {Guerraoui, R. and Oliveira, R. and Schiper, A.}, Date-Modified = {2006-05-18 14:26:23 +0900}, Institution = {{\'E}cole Polytechnique F{\'e}d{\'e}rale Lausanne}, Language = {English}, Number = {98/272}, Title = {Stubborn communication channels}, Url = {http://lsewww.epfl.ch/Publications/ById/165.html}, Year = {1998}} @book{GR93, Address = {San Mateo (CA), {USA}}, Author = {Gray, J. N. 
and Reuter, A.}, Isbn = {1-55860-190-2}, Language = {English}, Publisher = {Morgan Kaufmann Publishers, Inc.}, Series = {Data Management Systems}, Title = {Transaction Processing: concepts and techniques}, Year = {1993}} @techreport{GS95, Address = {Departement of Computing Science, University of Newcastle-upon-Tyne NE1 7RU UK}, Author = {Guerraoui, R. and Schiper, A.}, Institution = {Esprit Research Project 6360}, Language = {English}, Title = {Transaction model vs Virtual Synchrony model: bridging the gap}, Type = {Broadcast Technical Report}, Year = {1995}} @inproceedings{GS96d, Address = {Sendai, Japan}, Author = {Guerraoui, R. and Schiper, A.}, Booktitle = {Proceedings of the $26^{th}$ International Symposium on Fault-Tolerant Computing (FTCS-26)}, Date-Modified = {2006-05-17 16:14:17 +0900}, Keywords = {consensus, service}, Language = {English}, Pages = {168--177}, Title = {Consensus Service: {A} Modular Approach For Building Fault-Tolerant Agreement Protocols in Distributed Systems}, Url = {http://doi.ieeecomputersociety.org/10.1109/FTCS.1996.534604}, Year = {1996}, Abstract = {We describe a consensus service and suggest its use for the construction of fault-tolerant agreement protocols. We show how to build agreement protocols, using a classical client-server interaction, where: the clients are the processes that must solve the agreement problem; and the servers implement the consensus service. Using a generic notion, called consensus filter, we illustrate our approach on non-blocking atomic commitment and on view synchronous multicast. The approach can trivially be used for total order broadcast. In addition of its modularity, our approach enables efficient implementations of the protocols, and precise characterization of their liveness.}} @inproceedings{GS96e, Author = {Guerraoui, R. 
and Schiper, A.}, Booktitle = {Reliable Software Technologies - Ada-Europe'96}, Date-Modified = {2006-05-17 16:56:35 +0900}, Isbn = {3-540-61317-X}, Keywords = {fault-tolerance, distributed systems}, Language = {English}, Pages = {38--57}, Publisher = {Springer-Verlag}, Series = {LNCS~1088}, Title = {Fault-Tolerance by Replication in Distributed Systems}, Url = {http://infoscience.epfl.ch/search.py?recid=50116&ln=fr}, Year = {1996}} @inproceedings{GS97, Address = {Saarbr{\"u}cken, Germany}, Author = {Guerraoui, R. and Schiper, A.}, Booktitle = {Proceedings of the $11^{th}$ International Workshop on Distributed Algorithms (WDAG-11)}, Date-Modified = {2006-05-17 16:21:13 +0900}, Isbn = {3-540-63575-0}, Keywords = {atomic broadcast}, Language = {English}, Pages = {141--154}, Publisher = {Springer-Verlag}, Title = {Genuine Atomic Multicast}, Url = {http://citeseer.ist.psu.edu/guerraoui97genuine.html}, Year = {1997}} @article{GS97c, Author = {Guerraoui, R. and Schiper, A.}, Date-Modified = {2006-05-17 16:23:44 +0900}, Issn = {0018-9162}, Journal = {{IEEE} Computer}, Keywords = {replication, fault-tolerance}, Language = {English}, Month = {April}, Number = {4}, Pages = {68--74}, Title = {Software-Based Replication for Fault Tolerance}, Url = {http://doi.ieeecomputersociety.org/10.1109/2.585156}, Volume = {30}, Year = {1997}, Abstract = {Developers of early distributed systems took a simplistic approach to providing fault tolerance: They just used another copy of the same hardware as a backup. Later, others developed replication software to work on off-the-shelf hardware. Since neither of these methods is especially economical, a logical course is to take it one step further and eliminate the extra hardware altogether. Fully software-based replication relies on sophisticated techniques to keep track of server communications and ensure the consistency of information across several server replicas. 
How do you know that each server shares the same view of the data or program semantics? What happens if a server replica crashes? How do you make sure that a system processes invocations in the correct order? These are all problems that a replication technique has to handle. The authors describe two fundamental techniques, primary-backup and active replication, and illustrate how they handle these problems. At this point, both have advantages and disadvantages that depend on the application. The authors also propose that group communication provides a sufficient framework for implementing software-based replication. The concept of static and dynamic groups proves useful in thinking about how to implement replication techniques. Replication techniques can also use total-order and view-synchronous multicast primitives from group communication.}} @inproceedings{GSEBW94, Address = {New York, {NY}, {USA}}, Affiliation = {Digital Equipment Corp., San Francisco, CA, USA}, Author = {Gray, J. and Sundaresan, P. and Englert, S. and Baclawski, K. and Weinberger, P. J.}, Booktitle = {{SIGMOD} Record ({ACM} Special Interest Group on Management of Data)}, Date-Modified = {2006-04-11 10:35:24 +0900}, Issn = {0163-5808}, Keywords = {database, performance, large-scale}, Language = {English}, Number = {2}, Pages = {243--252}, Publisher = {{ACM} Press}, Title = {Quickly generating billion-record synthetic databases}, Volume = {23}, Year = {1994}} @article{Gaertner99, Author = {G{\"{a}}rtner, F. 
C.}, Date-Modified = {2006-03-08 19:04:41 +0900}, Journal = {{ACM} Computing Surveys}, Keywords = {distributed computing, fault-tolerance}, Language = {English}, Number = {1}, Pages = {1--26}, Title = {Fundamentals of fault-tolerant distributed computing in asynchronous environments}, Url = {http://www.acm.org/pubs/articles/journals/surveys/1999-31-1/p1-gartner/p1-gartner.pdf}, Volume = {31}, Year = {1999}} @phdthesis{Gar98, Address = {Switzerland}, Author = {Garbinato, B.}, Language = {English}, Number = {1801}, School = {\'{E}cole Polytechnique F\'{e}d\'{e}rale de Lausanne}, Title = {Protocol Objects \& Patterns for Structuring Reliable Distributed Systems}, Url = {http://lsewww.epfl.ch/Documents/postscript/Gar98.ps}, Year = {1998}} @techreport{Gart01, Author = {G{\"a}rtner, F.~C.}, Date-Modified = {2006-07-14 18:04:16 +0900}, Institution = {Darmstadt University of Technology}, Keywords = {failure detection, time model}, Language = {English}, Number = {TUD-BS-2001-01}, Title = {A gentle introduction to failure detectors and related problems}, Url = {http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-2001-01.ps.gz}, Year = {2001}, Annote = {A more informal introduction to defining and using{\newline} unreliable failure detectors~\cite{CT96} in the design{\newline} and analysis of fault tolerant distributed{\newline} algorithms.}} @inproceedings{Gifford79, Address = {Asilomar Conference Grounds, Pacific Grove CA}, Author = {Gifford, D. 
K.}, Booktitle = {Proceedings of the Seventh Symposium on Operating System Principles {SOSP 7}}, Date-Modified = {2006-04-11 10:27:23 +0900}, Isbn = {0-89791-009-5}, Keywords = {quorum}, Language = {English}, Pages = {150--162}, Publisher = {{ACM}, New York}, Title = {Weighted Voting for Replicated Data}, Year = {1979}} @article{Gol94, Alternate-Key = {Goldring94}, Author = {Goldring, R.}, Journal = {Info DB}, Keywords = {Database Replication}, Language = {English}, Number = {8}, Title = {A Discussion of Database Replication Technology}, Volume = {1}, Year = {1994}} @inproceedings{Goldring95, Address = {San Jose, CA USA}, Author = {Goldring, R.}, Booktitle = {Proceedings of the 1995 ACM SIGMOD international conference on Management of data}, Date-Modified = {2006-03-08 15:23:54 +0900}, Keywords = {Database, Replication}, Language = {English}, Pages = {439--440}, Title = {Things every update replication customer should know (abstract)}, Url = {http://www.acm.org/pubs/articles/proceedings/mod/223784/p439-goldring/p439-goldring.pdf}, Year = {1995}} @incollection{Gray78, Address = {Heidelberg, FRG and New York, NY, USA}, Author = {Gray, J. N.}, Booktitle = {Operating Systems, an Advanced Course}, Editor = {Bayer, R. and Graham, R. M. and Seegm{\"u}ller, G.}, Language = {English}, Publisher = {Springer Verlag}, Series = {LNCS}, Title = {Notes on Data Base Operating Systems}, Volume = {60}, Year = {1978}} @techreport{Gray85, Address = {Cupertino, CA, USA}, Author = {Gray, J.}, Date-Modified = {2005-09-28 15:54:09 +0900}, Institution = {Tandem Computers Inc.}, Language = {English}, Number = {85.7}, Title = {Why do Computers Stop and What can be done about it}, Url = {http://athos.rutgers.edu/~rmartin/teaching/spring03/cs553/readings/gray85.pdf}, Year = {1985}, Abstract = {An analysis of the failure statistics of a commercially available fault-tolerant system shows that administration and software are the major contributors to failure. 
Various approaches to software fault-tolerance are then discussed - notably process-pairs, transactions and reliable storage. It is pointed out that faults in production software are often soft (transient) and that a transaction mechanism combined with persistent process-pairs provides fault-tolerant execution - the key to software fault-tolerance.}} @article{Gray90, Author = {Gray, J.}, Date-Modified = {2006-03-08 19:07:36 +0900}, Journal = {IEEE Transactions on Reliability}, Keywords = {reliability}, Language = {English}, Number = {4}, Pages = {409--417}, Title = {A Census of Tandem System Availability Between 1985 and 1990}, Volume = {39}, Year = {1990}} @inbook{Gray94, Author = {Gray, J.~N. and Lorie, R. and Putzolu, G. and Traiger, I.}, Chapter = {3, Granularity of Locks and Degrees of Consistency in a Shared Database}, Edition = {Third}, Isbn = {1-55860-523-1}, Language = {English}, Publisher = {Morgan Kaufmann}, Series = {Data Management Systems}, Title = {Readings in Database Systems}, Year = {1994}, Annote = {edited by Michael Stonebraker}} @inproceedings{Gue95, Address = {Le Mont-St-Michel, France}, Author = {Guerraoui, R.}, Booktitle = {Proceedings of the $9^{th}$ International Workshop on Distributed Algorithms (WDAG-9)}, Date-Modified = {2006-04-11 10:25:28 +0900}, Keywords = {atomic commitment, consensus}, Language = {English}, Pages = {87--100}, Publisher = {Springer-Verlag}, Series = {LNCS~972}, Title = {Revisiting the Relationship between Non-Blocking Atomic Commitment and Consensus}, Url = {http://lsewww.epfl.ch/Publications/ById/53.html}, Year = {1995}} @techreport{Guerraoui96, Address = {France}, Author = {Guerraoui, R.}, Institution = {Universit{\'e} Joseph Fourier (Grenoble 1)}, Language = {French}, Title = {Transactions {R}{\'e}parties, Syst{\`e}mes et Langages}, Url = {http://lsewww.epfl.ch/~rachid/papers/habilit.ps}, Year = {1996}} @inproceedings{HAA00, Address = {N{\"{u}}rnberg, Germany}, Author = {Holliday, J. and Agrawal, D. 
and El~Abbadi, A.}, Booktitle = {Proceedings of the $19^{th}$ Symposium on Reliable Distributed Systems {SRDS'2000}}, Date-Modified = {2006-03-08 15:25:16 +0900}, Isbn = {0-7695-0543-0}, Issn = {1060-9857}, Keywords = {database, replication, atomic broadcast, deadlock}, Language = {English}, Pages = {196--205}, Publisher = {{IEEE} Computer Society, Los Alamitos, California}, Title = {Using Multicast Communication to Reduce Deadlocks in Replicated Databases}, Url = {http://citeseer.nj.nec.com/holliday00using.html}, Year = {2000}, Annote = {shows abcast based replication yields less deadlocks}} @inproceedings{HAA02, Address = {Vienna, Austria}, Author = {Holliday, J. and Agrawal, D. and El~Abbadi, A.}, Booktitle = {Proceedings, the $22^{nd}$ International Conference on Distributed Computing Systems, (ICDCS2002)}, Language = {English}, Organization = {IEEE}, Title = {Partial Database Replication using Epidemic Communication}, Url = {http://www.cse.scu.edu/~jholliday/partial.ps}, Year = {2002}} @inproceedings{HAA99, Author = {Holliday, J. and Agrawal, D. and El~Abbadi, A.}, Booktitle = {Proceedings of International Symposium on Fault Tolerant Computing ({FTCS}29)}, Date-Modified = {2006-04-11 10:40:08 +0900}, Keywords = {database replication, atomic broadcast}, Language = {English}, Organization = {{IEEE} Computer Society}, Pages = {158--165}, Title = {The Performance of Database Replication with Group Multicast}, Url = {http://www.cs.ucsb.edu/~joanne46/ftcs29.ps}, Year = {1999}} @techreport{HAA99b, Address = {Santa Barbara {USA}}, Author = {Holliday, J. and Agrawal, D. and Abbadi, A. 
El}, Date-Modified = {2006-04-11 10:42:21 +0900}, Institution = {Computer Science Department, University of California, Santa Barbara}, Keywords = {database replication, atomic broadcast}, Language = {English}, Note = {long form of~\cite{HAA99}}, Number = {TRCS99-11}, Title = {The Performance of Replicated Databases using Atomic Broadcast Group Communication}, Url = {http://www.cs.ucsb.edu/TRs/techreports/TRCS99-11.ps}, Year = {1999}} @inproceedings{HAA99c, Author = {Holliday, J. and Agrawal, D. and El~Abbadi, A.}, Booktitle = {Proceedings of $18^{th}$ Symposium on Reliable Distributed Systems {SRDS'99}}, Date-Modified = {2006-04-11 10:39:32 +0900}, Institution = {Department of Computer Science, University of California at Santa Barbara, Santa Barbara, {CA} 93106, {USA}}, Isbn = {0-7695-0290-3}, Issn = {1060-9857}, Keywords = {database replication, atomic broadcast, lazy replication}, Language = {English}, Pages = {304--305}, Publisher = {{IEEE} Computer Society Press}, Title = {Database Replication: If You Must be Lazy, be Consistent}, Year = {1999}} @techreport{HB96, Author = {Hayden, M. and Birman, K.}, Date-Modified = {2006-04-11 10:36:57 +0900}, Institution = {Cornell University, Computer Science}, Keywords = {atomic broadcast}, Language = {English}, Number = {TR96-1606}, Pages = {15}, Title = {Probabilistic Broadcast}, Year = {1996}} @inproceedings{HHMRF00, Address = {Taipei, Taiwan}, Author = {H{\'e}lary, J.-M. and Hurfin, M. and Mostefaoui, A. and Raynal, M. and Tronel, F.}, Booktitle = {Proceedings of $20^{th}$ International Conference on Distributed Computing Systems ({ICDCS}'2000)}, Language = {English}, Organization = {{IEEE}}, Pages = {584--591}, Title = {Computing Global Functions in Asynchronous Distributed Systems Prone to Process Crashes}, Year = {2000}} @article{HR83, Author = {H{\"{a}}rder, T. 
and Reuter, A.}, Date-Modified = {2006-03-08 15:23:08 +0900}, Issn = {0360-0300}, Journal = {ACM Computing Surveys}, Keywords = {database, reliability, recovery}, Language = {English}, Note = {Reprinted in~\cite{Stonebraker88}}, Number = {4}, Pages = {287--317}, Title = {Principles of Transaction Oriented Database Recovery}, Url = {http://www.acm.org/pubs/citations/journals/surveys/1983-15-4/p287-haerder/}, Volume = {15}, Year = {1983}} @techreport{HR93, Address = {AFIT/ENG, 2950 P St, WPAFB OH 45433-7765}, Author = {Halloran, T. J. and Roth, M. A.}, Date-Modified = {2006-03-08 15:24:04 +0900}, Institution = {{US} Air Force}, Keywords = {database, performance}, Language = {English}, Number = {AFIT/EN-TR-93-5}, Title = {``Magic mirror on the wall, who's the fastest Database of them all?'' {A} Survey of Database Benchmarks}, Url = {ftp://ftp.afit.af.mil/pub/techreports/tr-93-05.ps}, Year = {1993}} @inproceedings{HS00, Address = {N{\"{u}}rnberg, Germany}, Author = {Hiltunen, M.~A. and Schlichting, R.~D.}, Booktitle = {Proceedings of the Workshop on Dependable System Middleware and Group Communication (DSMGC 2000)}, Date-Modified = {2006-04-11 10:40:46 +0900}, Keywords = {group communication, toolkit}, Language = {English}, Project-Url = {http://www.cs.arizona.edu/cactus/}, Title = {The Cactus Approach to Building Configurable Middleware Services}, Url = {ftp://ftp.cs.arizona.edu/ftol/papers/dsmgc00.pdf}, Year = {2000}} @inproceedings{HS93, Author = {Hiltunen, M.~A. and Schlichting, R.~D.}, Booktitle = {Proceedings of the $12^{th}$ Symposium on Reliable Distributed Systems (SRDS'93)}, Language = {English}, Organization = {IEEE}, Pages = {105--114}, Title = {An Approach to Constructing Modular Fault-Tolerant Protocols}, Url = {ftp://ftp.cs.arizona.edu/ftol/papers/srds12.ps}, Year = {1993}} @incollection{HT93, Alt-Key = {Hadzilacos93}, Author = {Hadzilacos, V. 
and Toueg, S.}, Booktitle = {Distributed Systems}, Chapter = {5}, Date-Modified = {2006-04-11 10:38:12 +0900}, Edition = {second}, Editor = {Mullender, Sape}, Keywords = {atomic broadcast}, Language = {English}, Publisher = {Addison-Wesley}, Title = {Fault-Tolerant Broadcasts and Related Problems}, Year = {1993}} @techreport{HT94, Author = {Hadzilacos, V. and Toueg, S.}, Date-Modified = {2006-08-01 14:00:04 +0900}, Institution = {Cornell University, Computer Science Department}, Keywords = {atomic broadcast, reliable broadcast}, Language = {English}, Number = {TR94-1425}, Pages = {83}, Title = {A Modular Approach to Fault-Tolerant Broadcasts and Related Problems}, Type = {Technical Report}, Url = {http://citeseer.ist.psu.edu/hadzilacos94modular.html}, Year = {1994}} @article{HW90, Abstract-Url = {http://www.acm.org/pubs/toc/Abstracts/0164-0925/78972.html}, Author = {Herlihy, M.~P. and Wing, J.~M.}, Date-Modified = {2006-03-08 19:04:49 +0900}, Issn = {0164-0925}, Journal = {ACM Transactions on Programming Languages and Systems}, Keywords = {consistency}, Language = {English}, Number = {3}, Pages = {463--492}, Title = {Linearizability: {A} Correctness Condition for Concurrent Objects}, Url = {http://www.cs.ucsb.edu/~ambuj/Courses/290I/linear.pdf}, Volume = {12}, Year = {1990}} @article{Had88, Affiliation = {University of Toronto, Toronto, Ontario, Canada}, Author = {Hadzilacos, V.}, Date-Modified = {2006-03-08 15:23:21 +0900}, Issn = {0004-5411}, Journal = {Journal of the {ACM}}, Keywords = {database, concurency control, reliability}, Language = {English}, Number = {1}, Pages = {121--145}, Title = {A Theory of Reliability in Database Systems}, Url = {http://www.acm.org/pubs/articles/journals/jacm/1988-35-1/p121-hadzilacos/p121-hadzilacos.pdf}, Volume = {35}, Year = {1988}, Annote = {Concepts of commit serializability, recoverability,{\newline} and resiliency. 
Principles of reliable transaction.{\newline} Specification.}} @article{Hadzilacos88, Author = {Hadzilacos, V.}, Date-Modified = {2006-03-08 19:11:30 +0900}, Journal = {Journal of the {ACM}}, Keywords = {reliability, database}, Language = {English}, Month = {January}, Number = {1}, Pages = {121--145}, Title = {A Theory of Reliability in Database Systems}, Url = {http://citeseer.ist.psu.edu/context/437301/0}, Volume = {35}, Year = {1988}} @techreport{Hep95, Address = {Geelong, Victoria, Australia}, Alt-Url = {http://citeseer.nj.nec.com/hepner95integrating.html}, Author = {Hepner, P.}, Institution = {School of Computing and Mathematics, Deakin University}, Language = {English}, Number = {C95/31}, Title = {Integrating Heterogenous Databases: An Overview}, Url = {ftp://ftp.cm.deakin.edu.au/pub/TR/Computing/TR-C95-30.ps.gz}, Year = {1995}} @inproceedings{Hol01, Address = {Cambridge, MA, USA}, Alt-Url = {http://citeseer.nj.nec.com/451497.html}, Author = {Holliday, J.}, Booktitle = {Proceedings of the Symposium on Network Computing and Applications (NCA'01)}, Isbn = {0-7695-1432-4}, Language = {English}, Organization = {IEEE}, Pages = {104--107}, Title = {Replicated Database Recovery Using Multicast Communications}, Url = {http://www.cse.scu.edu/~jholliday/NCAtechrep.ps}, Year = {2001}, Annote = {url is extended version, probably technical report}} @article{ISP00, Author = {Ingham, D.~B. and Shrivastava, S.~K. 
and Panzieri, F.}, Date-Modified = {2006-05-17 16:48:06 +0900}, Journal = {{IEEE} Internet Computing}, Keywords = {distributed systems, group communication, web services, fault-tolerance}, Language = {English}, Pages = {25--33}, Title = {Constructing Dependable Web Services}, Url = {http://archilab.sogang.ac.kr/pdf/Constructing_Dependable_Web.pdf}, Year = {2000}, Annote = {Good overview of the subject}} @manual{Informix98, Address = {4100 Bohannon Drive, Menlo Park, California 94025 {USA}}, Language = {English}, Organization = {Informix}, Title = {Informix Replication: {A} High-Performance Solution for Distributing and Sharing Information}, Type = {White paper}, Url = {http://www.informix.com/informix/whitepapers/entrep.pdf}, Year = {1998}, Annote = {Very commercial blurb - a little info on replication}} @techreport{Isis84, Author = {Birman, K. P. and El~Abbadi, A. and Dietrich, W. and Joseph, T. A. and Raeuchle, T.}, Date-Modified = {2006-03-08 19:26:23 +0900}, Institution = {Cornell University, Computer Science Department}, Keywords = {group communication, toolkit}, Language = {English}, Number = {TR84-642}, Pages = {15}, Title = {An Overview of the {I}sis Project}, Type = {Technical Report}, Year = {1984}} @book{JAI91, Address = {New York {USA}}, Author = {Jain, R.}, Language = {English}, Publisher = {John Wiley and Sons, Inc.}, Title = {The art of computer system performance analysis: techniques for experimental design, measurement, simulation and modeling}, Year = {1991}} @article{JB86, Author = {Joseph, T. A. and Birman, K. P.}, Journal = {{ACM} Transactions on Computer Systems}, Keywords = {Replication}, Language = {English}, Number = {1}, Pages = {54--70}, Title = {Low Cost Management of Replicated Data in Fault-Tolerant Distributed Systems}, Volume = {4}, Year = {1986}} @inproceedings{JPA01, Author = {Jim{\'e}nez-Peris, R. and Pati{\~n}o-Mart{\'\i}nez, M. 
and Alonso, G.}, Booktitle = {Proceedings of the $4^{th}$ CaberNet Workshop}, Date-Modified = {2006-08-01 14:04:28 +0900}, Keywords = {atomic broadcast}, Language = {English}, Location = {Pisa, Italy}, Title = {Is reliable multicast too expensive? Let's be optimistic}, Url = {http://www.inf.ethz.ch/personal/alonso/PAPERS/CA-WS-01.pdf}, Year = {2001}} @inproceedings{JPAA01, Address = {Lisbon, Portugal}, Author = {Jim{\'e}nez-Peris, R. and Pati{\~n}o-Mart{\'\i}nez, M. and Alonso, G. and Ar{\'e}valo, S.}, Booktitle = {Proceedings of the $15^{th}$ International Conference on Distributed Computing (DISC 2001)}, Date-Modified = {2006-03-08 19:35:02 +0900}, Editor = {Welch, J.}, Isbn = {3-540-42605-1}, Issn = {0302-9743}, Keywords = {atomic commitment}, Language = {English}, Pages = {93--107}, Publisher = {Springer-Verlag}, Series = {Lecture Notes in Computer Science}, Title = {A Low Latency Non-blocking Commit Server}, Volume = {2180}, Year = {2001}} @inproceedings{JPPM+01, Address = {New Orleans, LA, USA}, Author = {Jim{\'e}nez-Peris, R. and Pati{\~n}o-Mart{\'\i}nez, M. and Alonso, G. and Kemme, B.}, Booktitle = {Proceedings of the $20^{th}$ Symposium on Reliable Distributed Systems}, Isbn = {0-7695-1366-2}, Issn = {1060-9857}, Language = {English}, Pages = {24--33}, Publisher = {IEEE Computer Society, Los Alamitos, California, USA}, Title = {How to Select a Replication Protocol According to Scalability, Availability and Communication Overhead}, Year = {2001}} @inproceedings{Jagadish97, Address = {Birmingham {UK}}, Author = {Jagadish, H. V. and Mumick, I. S.
and Rabinovich, M.}, Booktitle = {Proceedings of the Thirteenth International Conference on Data Engineering}, Language = {English}, Pages = {520--531}, Publisher = {{IEEE} Computer Society Press}, Title = {Scalable Versioning in Distributed Databases with Commuting Updates}, Year = {1997}} @article{Jajodia99, Author = {Jajodia, S.}, Date-Modified = {2006-03-08 19:04:13 +0900}, Journal = {{IEEE} Concurrency}, Keywords = {database, replication}, Language = {English}, Note = {Interview of {Y}uri {B}reitbart and {H}ank {K}orth}, Pages = {85--86}, Title = {Data Replication gaining Popularity}, Year = {1999}} @article{Jefferson85, Author = {Jefferson, D. R.}, Date-Modified = {2006-03-08 19:15:06 +0900}, Journal = {{ACM} Transactions on Programming Languages and Systems}, Keywords = {time model}, Language = {English}, Number = {3}, Pages = {404--425}, Title = {Virtual Time}, Url = {http://www.acm.org/pubs/articles/journals/toplas/1985-7-3/p404-jefferson/p404-jefferson.pdf}, Volume = {7}, Year = {1985}} @inproceedings{KA00, Address = {Cairo, Egypt}, Author = {Kemme, B. and Alonso, G.}, Booktitle = {Proceedings of the $26^{th}$ International Conference on Very Large Databases ({VLDB})}, Date-Modified = {2006-04-11 10:43:16 +0900}, Keywords = {database replication, atomic broadcast}, Language = {English}, Title = {Don't be lazy, be consistent: {Postgres-R}, a new way to implement Database Replication}, Url = {http://www.inf.ethz.ch/department/IS/iks/publications/files/ka00.pdf}, Year = {2000}} @article{KA00b, Author = {Kemme, B.
and Alonso, G.}, Date-Modified = {2006-05-17 16:43:54 +0900}, Journal = {ACM Transactions on Database Systems}, Keywords = {database replication, fault-tolerance}, Language = {English}, Number = {3}, Pages = {333--379}, Title = {A New Approach to Developing and Implementing Eager Database Replication Protocols}, Url = {http://www.acm.org/pubs/articles/journals/tods/2000-25-3/p333-kemme/p333-kemme.pdf}, Volume = {25}, Year = {2000}} @inproceedings{KA98, Address = {Amsterdam, The Netherlands}, Author = {Kemme, B. and Alonso, G.}, Booktitle = {Proceedings of the $18^{th}$ International Conference on Distributed Computing Systems (ICDCS'98)}, Date-Modified = {2006-05-17 17:04:07 +0900}, Keywords = {Dragon, database replication, group communication}, Language = {English}, Title = {A Suite of Database Replication Protocols based on Group Communication Primitives}, Url = {http://www.inf.ethz.ch/department/IS/iks/publications/files/ka98a.ps.gz}, Year = {1998}} @inproceedings{KA99, Abstract-Url = {http://www.inf.ethz.ch/department/IS/iks/publications/ka99b.html}, Address = {Madeira Island (Portugal)}, Author = {Kemme, B. and Alonso, G.}, Booktitle = {$3^{rd}$ European Research Seminar on Advances in Distributed Systems ({ERSADS'99})}, Language = {English}, Organization = {{BROADCAST} {E}sprit WG 22455}, Title = {Transactions, Messages and Events: Merging Group Communication and Database System}, Url = {http://www.inf.ethz.ch/department/IS/iks/publications/files/ka99b.ps.gz}, Year = {1999}} @article{KB94, Affiliation = {State Univ. of New York, Stony Brook, NY, USA}, Author = {Krishnakumar, N. and Bernstein, A.
J.}, Issn = {0362-5915}, Journal = {ACM Transactions on Database Systems}, Keywords = {algorithms; performance; theory}, Language = {English}, Number = {4}, Pages = {586--625}, Title = {Bounded Ignorance: {A} Technique for Increasing Concurrency in a Replicated System}, Url = {http://www.acm.org/pubs/toc/Abstracts/tods/195670.html}, Volume = {19}, Year = {1994}} @inproceedings{KBB01, Alternate-Key = {Kemme01}, Author = {Kemme, B. and Bartoli, A. and Babao{\u{g}}lu, {\"O}.}, Booktitle = {Proc. of the Int. Conf. on Dependable Systems and Networks (DSN)}, Date-Modified = {2006-09-15 15:21:26 +0900}, Keywords = {database replication, recovery, group communication}, Language = {English}, Location = {G{\"{o}}teborg, Sweden}, Organization = {IEEE}, Pages = {117--127}, Title = {Online Reconfiguration in Replicated Databases Based on Group Communication}, Url = {http://www.cs.mcgill.ca/~kemme/papers/dsn01.pdf}, Year = {2001}, Abstract = {Over the last years, many replica control protocols have been developed that take advantage of the ordering and reliability semantics of group communication primitives to simplify database system design and to improve performance. Although current solutions are able to mask site failures effectively, many of them are unable to cope with recovery of failed sites, merging of partitions, or joining of new sites. This paper addresses this important issue. It proposes efficient solutions for online system reconfiguration providing new sites with a current state of the database without interrupting transaction processing in the rest of the system. Furthermore, the paper analyzes the impact of cascading reconfigurations, and argues that they can be handled in an elegant way by extended forms of group communication.}} @inproceedings{KLS94, Address = {Gaithersburg, MD USA}, Author = {Kirsche, T. and Lenz, R.
and Schuster, H.}, Booktitle = {Proceedings of the third international conference on Information and knowledge management}, Date-Modified = {2006-03-08 15:23:43 +0900}, Keywords = {Database, Cooperation}, Language = {English}, Pages = {384--391}, Title = {Functionality and Architecture of a Cooperative Database System - {A} Vision -}, Url = {http://www.acm.org/pubs/articles/proceedings/cikm/191246/p384-kirsche/p384-kirsche.pdf}, Year = {1994}} @techreport{KLW96, Author = {Kindler, E. and Listl, A. and Walter, R.}, Institution = {Humboldt-Universit{\"a}t zu Berlin Germany}, Language = {English}, Number = {56}, Title = {A Specification Method for Transaction Models with Data Replication}, Url = {http://www.informatik.hu-berlin.de/~kindler/PostScript/HUIB56.ps}, Year = {1996}} @inproceedings{KPAS99, Abstract-Url = {http://www.inf.ethz.ch/department/IS/iks/publications/kpas99a.html}, Address = {Austin, Texas}, Author = {Kemme, B. and Pedone, F. and Alonso, G. and Schiper, A.}, Booktitle = {Proceedings of the International Conference on Distributed Computing Systems}, Language = {English}, Title = {Processing Transactions over Optimistic Atomic Broadcast Protocols}, Url = {http://www.inf.ethz.ch/department/IS/iks/publications/files/kpas99a.ps.gz}, Year = {1999}} @techreport{KPAS01, Author = {Kemme, B. and Pedone, F. and Alonso, G. and Schiper, A. and Wiesmann, M.}, Date-Modified = {2006-08-01 13:37:55 +0900}, Institution = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne}, Language = {English}, Number = {DSC/2001/053}, Title = {Using Optimistic Atomic Broadcast in Transaction Processing Systems}, Url = {http://infoscience.epfl.ch/search.py?recid=49967}, Year = {2001}, Abstract = {Atomic broadcast primitives are often proposed as a mechanism to allow fault-tolerant cooperation between sites in a distributed system.
Unfortunately, the delay incurred before a message can be delivered makes it difficult to implement high performance, scalable applications on top of atomic broadcast primitives. Recently, a new approach has been proposed for atomic broadcast which, based on optimistic assumptions about the communication system, reduces the average delay for message delivery to the application. In this paper, we develop this idea further and show how applications can take even more advantage of the optimistic assumption by overlapping the coordination phase of the atomic broadcast algorithm with the processing of delivered messages. In particular, we present a replicated database architecture that employs the new atomic broadcast primitive in such a way that communication and transaction processing are fully overlapped, providing high performance without relaxing transaction correctness.}} @article{KS93, Author = {Kumar, A. and Segev, A.}, Date-Modified = {2006-03-08 19:05:08 +0900}, Journal = {ACM Transactions on Database Systems}, Keywords = {concurency control, replication}, Language = {English}, Number = {1}, Pages = {102--131}, Title = {Cost and Availability Tradeoffs in Replicated Concurrency Control}, Volume = {18}, Year = {1993}} @inproceedings{KT91, Address = {Washington, D.C., {USA}}, Alt-Key = {Kaashoek91}, Author = {Kaashoek, M. F. and Tanenbaum, A. S.}, Booktitle = {Proceedings of the $11^{th}$ International Conference on Distributed Computing Systems {ICDCS}}, Date-Modified = {2006-08-01 14:18:11 +0900}, Isbn = {0-8186-2144-3}, Keywords = {toolkit, group communication}, Language = {English}, Organization = {IEEE}, Pages = {222--230}, Title = {Group Communication in the {A}m{\oe}ba Distributed Operating System}, Year = {1991}, Abstract = {Primitives for broadcast communication that have been integrated with the Amoeba distributed operating system are introduced. The semantics of the broadcast primitives are simple and easy to understand, but are still powerful.
The proposed primitives, for example, guarantee global ordering of broadcast messages. The proposed primitives are also efficient: a reliable broadcast can be done in just slightly more than two messages, so the performance is comparable to a remote procedure call. In addition, the primitives are flexible; user applications can, for example, trade performance against fault-tolerance}} @techreport{KT94, Author = {Kaashoek, M. F. and Tanenbaum, A. S.}, Date-Modified = {2006-08-01 14:05:47 +0900}, Institution = {M.I.T., Cambridge, and Vrije Universiteit, Amsterdam}, Keywords = {group communication}, Language = {English}, Title = {Efficient Reliable Group Communication for Distributed Systems}, Year = {1994}} @article{Katz90, Author = {Katz, R. H.}, Journal = {ACM Computing Surveys}, Language = {English}, Title = {Toward a Unified Framework for Version Modeling in Engineering Databases}, Year = {1990}} @mastersthesis{Keidar94, Abstract-Url = {http://theory.lcs.mit.edu/~idish/Abstracts/keidar-msc.html}, Address = {Jerusalem, Israel}, Alt-Url = {http://www.cs.huji.ac.il/~transis/ftp/thesis/keidar-msc.ps.gz}, Author = {Keidar, I.}, Date-Modified = {2006-08-23 18:18:49 +0900}, Keywords = {replication}, Language = {English}, Note = {also technical report CS94}, School = {The Hebrew University of Jerusalem}, Title = {A Highly Available Paradigm for Consistent Object Replication}, Url = {http://theory.lcs.mit.edu/~idish/ftp/keidar-msc.ps.gz}, Year = {1994}} @inproceedings{Keleher99, Abstract-Url = {http://www.cs.umd.edu/~keleher/abstracts/podcDeno.html}, Address = {Atlanta, Georgia, USA}, Author = {Keleher, P.~J.}, Author-Url = {http://www.cs.umd.edu/~keleher/}, Booktitle = {Proceedings of the $18^{th}$ Symposium on Principles of Distributed Computing (PODC)}, Keywords = {DENO, lazy replication}, Language = {English}, Organization = {ACM SIGACT-SIGOPS}, Pages = {143--151}, Slides-Url = {http://www.cs.umd.edu/~keleher/talks/podc99.ppt}, Title = {Decentralized Replicated-Object
Protocols}, Url = {http://www.cs.umd.edu/~keleher/papers/podc99.pdf}, Year = {1999}} @phdthesis{Kem00, Abstract-Url = {http://www.cs.mcgill.ca/~kemme/papers/phd.html}, Address = {Switzerland}, Author = {Kemme, B.}, Keywords = {Dragon}, Language = {English}, Note = {No. 13864}, Number = {13864}, School = {Swiss Federal Institute of Technology Z{\"{u}}rich}, Title = {Database Replication for Clusters of Workstations}, Url = {http://www.cs.mcgill.ca/~kemme/papers/phd-dina4.pdf}, Year = {2000}} @inproceedings{Kim00, Address = {N{\"{u}}rnberg Germany}, Affiliation = {{DREAM} Laboratory, University of California}, Affiliation-Address = {Irvine CA 92697 USA}, Affiliation-Url = {http://dream.eng.uci.edu}, Author = {Kim, K.~H.}, Booktitle = {Proceedings of the $19^{th}$ Symposium on Reliable Distributed Systems}, Date-Modified = {2005-11-02 15:17:46 +0900}, Language = {English}, Note = {Invited Paper}, Organization = {IEEE Computer Society}, Pages = {106--115}, Title = {Issues Insufficiently Resolved in Century 20 in the Fault-Tolerant Distributed Computing Field}, Url = {http://dream.eng.uci.edu/TMO/pdf/srds2000.pdf}, Year = {2000}} @article{Kim84, Affiliation = {IBM Research Laboratory}, Author = {Kim, W.}, Date-Modified = {2006-03-08 19:07:04 +0900}, Journal = {{ACM} Computing Surveys (CSUR)}, Keywords = {database, replication}, Language = {English}, Number = {1}, Pages = {71--98}, Title = {Highly available systems for database applications}, Volume = {16}, Year = {1984}} @article{Kung81, Author = {Kung, H. T. and Robinson, John T.}, Date-Modified = {2006-03-08 19:12:55 +0900}, Issn = {0362-5915}, Journal = {ACM Transactions on Database Systems}, Keywords = {concurency control, optimistic technique}, Language = {English}, Note = {Reprinted in \cite{Stonebraker:1988:RDS}.}, Number = {2}, Pages = {213--226}, Title = {On Optimistic Methods for Concurrency Control}, Volume = {6}, Year = {1981}} @inproceedings{LAA94, Address = {Los Alamitos, CA, USA}, Alt-Key = {Liu94}, Author = {Liu, M. L. and Agrawal, D.
and El~Abbadi, A.}, Booktitle = {Proceedings of the $24^{th}$ Annual International Symposium on Fault-Tolerant Computing}, Date-Modified = {2006-04-11 10:37:31 +0900}, Isbn = {0-8186-3680-7}, Keywords = {atomic commitment, 2pc}, Language = {English}, Pages = {234--243}, Publisher = {IEEE Computer Society Press}, Title = {The Performance of Two-Phase Commit Protocols in the Presence of Site Failures}, Year = {1994}} @techreport{LAA94b, Author = {Liu, M. L. and Agrawal, D. and El~Abbadi, A.}, Institution = {University of California, Santa Barbara. Computer Science.}, Language = {English}, Number = {TRCS94-09}, Title = {The Performance of Two-phase Commit Protocols in the Presence of Site Failures}, Type = {Technical Report}, Url = {ftp://ftp.cs.ucsb.edu/pub/techreports/TRCS94-09.ps}, Year = {1994}} @techreport{LAA97, Author = {Liu, M. L. and Agrawal, D. and El~Abbadi, A.}, Institution = {Computer Science Department, University of California, Santa Barbara}, Language = {English}, Number = {TRCS94-14}, Title = {What Price Replication?}, Type = {Technical Report}, Url = {ftp://ftp.cs.ucsb.edu/pub/techreports/TRCS94-14.ps}, Year = {1994}} @article{LHD98, Author = {Liu, X. and Helal, A. and Du, W.}, Date-Modified = {2006-03-08 15:22:49 +0900}, Journal = {ACM Transactions on Database Systems}, Keywords = {Database, scalability}, Language = {English}, Number = {2}, Pages = {158--198}, Title = {Multiview access protocols for large-scale replication}, Url = {http://www.acm.org/pubs/articles/journals/tods/1998-23-2/p158-liu/p158-liu.pdf}, Volume = {23}, Year = {1998}} @inproceedings{LL93, Address = {Dublin, Ireland}, Author = {Lampson, B. W. and Lomet, D.
B.}, Author-Url = {http://research.microsoft.com/users/blampson/Publications.html}, Booktitle = {Proceedings of the $19^{th}$ International Conference on Very Large Databases}, Date-Modified = {2006-04-11 10:38:31 +0900}, Editor = {Agrawal, Rakesh and Baker, Se{\'a}n and Bell, David A.}, Isbn = {1-55860-152-X}, Keywords = {atomic commitment, 2pc}, Language = {English}, Pages = {630--640}, Publisher = {Morgan Kaufmann}, Title = {A New Presumed Commit Optimization for Two Phase Commit}, Url = {http://research.microsoft.com/users/blampson/49-NewPCOpt/Acrobat.pdf}, Year = {1993}} @techreport{LL93b, Address = {One Kendall Square, Cambridge, Massachusetts 02139}, Author = {Lampson, B. and Lomet, D.}, Date-Modified = {2006-08-01 13:16:12 +0900}, Institution = {Digital Cambridge Research Laboratory}, Keywords = {2PC}, Language = {English}, Number = {CRL 93/1}, Title = {A new presumed commit optimization for two phase commit}, Url = {ftp://crl.dec.com/pub/DEC/CRL/tech-reports/93.1.ps.Z}, Year = {1993}} @article{LLG92, Author = {Ladin, R. and Liskov, B. and Shrira, L. and Ghemawat, S.}, Date-Modified = {2006-03-08 19:07:59 +0900}, Journal = {ACM Transactions on Computer Systems}, Keywords = {replication, lazy replication}, Language = {English}, Number = {4}, Pages = {360--391}, Title = {Providing High Availability Using Lazy Replication}, Volume = {10}, Year = {1992}} @article{LM97, Affiliation = {Olsen and Associates Z{\"{u}}rich Switzerland, Department of Computer Science Washington University St Louis, USA}, Author = {Maffeis, S.
and Schmidt, D.~C.}, Date-Modified = {2006-03-08 19:12:41 +0900}, Journal = {{IEEE} Communications Magazine}, Keywords = {group communication, CORBA}, Language = {English}, Number = {2}, Pages = {56--61}, Title = {Constructing Reliable Distributed Communication Systems with {CORBA}}, Url = {http://www.softwired-inc.com/people/maffeis/articles/research/ieeecomm.pdf}, Volume = {14}, Year = {1997}} @inproceedings{LS99, Affiliation = {Department of Computing Science University of Newcastle, Newcastle upon Tyne, NE1 7RU, England}, Author = {Little, M. C. and Shrivastava, S. K.}, Booktitle = {Eighth International Workshop on Persistent Object Systems: Design Implementation and Use}, Language = {English}, Title = {Understanding the Role of Atomic Transactions and Group Communications in Implementing Persistent Objects}, Year = {1998}} @techreport{LTWH98, Address = {130 Lytton Avenue Palo Alto, California 94301 {USA}}, Author = {Lee, E.~K. and Thekkath, C.~A. and Whitaker, C. and Hogg, J.}, Institution = {Digital Systems Research Center}, Language = {English}, Number = {155}, Title = {A comparison of Two Distributed Disk Systems}, Url = {http://gatekeeper.dec.com/pub/DEC/SRC/research-reports/abstracts/src-rr-155.html}, Year = {1998}} @techreport{Lam89, Address = {Palo Alto, USA}, Author = {Lamport, L.}, Institution = {System Research Center Digital Equipment Corp}, Language = {English}, Note = {A revised version was published in~\cite{Lam98}}, Title = {The Part-Time Parliament}, Url = {http://wilma.cs.brown.edu/courses/cs275/p133-lamport.pdf}, Year = {1989}} @article{Lam98, Alt-Url = {http://research.compaq.com/SRC/personal/lamport/pubs/lamport-paxos.ps.Z}, Author = {Lamport, L.}, Date-Modified = {2006-07-24 18:53:14 +0900}, Issn = {0734-2071}, Journal = {ACM Transactions on Computer Systems}, Keywords = {consensus, group communication, omega}, Language = {English}, Number = {2}, Pages = {133--169}, Title = {The Part-Time Parliament}, Url =
{http://www.acm.org:80/pubs/citations/journals/tocs/1998-16-2/p133-lamport/}, Volume = {16}, Year = {1998}} @techreport{Lindsay79, Address = {San Jose Research Laboratory}, Author = {Lindsay, B.~G. and Selinger, P.~G. and Galtieri, C. and Gray, J.~N. and Lorie, R.~A. and Price, T.~G. and Putzolu, F. and Wade, B.~W.}, Institution = {IBM}, Language = {English}, Number = {RJ2571(33471)}, Title = {Notes on Distributed Databases}, Year = {1979}} @inproceedings{Lyon88, Author = {Lyon, J.}, Booktitle = {Proceedings of {IEEE} Compcon}, Date-Modified = {2006-08-01 14:08:19 +0900}, Keywords = {database replication}, Language = {English}, Title = {Design Considerations in Replicated Database Systems for Disaster Protection}, Year = {1988}} @inproceedings{Lyon90, Author = {Lyon, J.}, Booktitle = {Proceedings of {IEEE} Compcon}, Language = {English}, Title = {Tandem's Remote Data Facility}, Year = {1990}} @inproceedings{MBWSZ00, Address = {Cairo, Egypt}, Author = {Mohan, C. and Barber, R. and Watts, S. and Somani, A. and Zaharioudakis, M.}, Booktitle = {Proceedings of $26^{th}$ International Conference on Very Large Databases (VLDB'00)}, Editor = {El~Abbadi, A. and Brodie, M.~L. and Chakravarthy, S. and Dayal, U. and Kamel, N. and Schlageter, G. and Whang, K.-Y.}, Isbn = {1-55860-715-3}, Publisher = {Morgan Kaufmann}, Title = {Evolution of Groupware for Business Applications: {A} Database Perspective on Lotus Domino/Notes}, Url = {http://www.almaden.ibm.com/cs/people/mohan/Domino_VLDB2000.pdf}, Year = {2000}} @inproceedings{MFS+95, Author = {Malloth, C. P. and Felber, P. and Schiper, A.
and Wilhelm, U.}, Booktitle = {Workshop on Parallel and Distributed Platforms in Industrial Products}, Date-Modified = {2006-08-01 13:43:31 +0900}, Keywords = {toolkit, group communication}, Language = {English}, Location = {San Antonio, Texas, USA}, Note = {Workshop held during the $7^{th}$ Symposium on Parallel and Distributed Processing, (SPDP-7)}, Organization = {IEEE}, Title = {Phoenix: {A} Toolkit for Building Fault-Tolerant Distributed Applications in Large Scale}, Url = {http://infoscience.epfl.ch/search.py?recid=50153&ln=fr}, Year = {1995}, Abstract = {Large scale systems are becoming more and more common today. There are many distributed applications emerging that use the capability of world-wide internetworking. Since a lot of applications need insurance of consistency even in the presence of failures, an adequate support for fault-tolerance is necessary. This can be provided by different paradigms and their implementations. Unfortunately, most of these implementations aim only local area networks, whereas our system, called Phoenix, will aim large scale where additional failure types have to be overcome. This paper shows the problems due to large scale, the limits of actual implementations, and our proposition to solve them.}} @article{MMABL96, Author = {Moser, L. E. and Melliar-Smith, P. M. and Agarwal, D. A. and Budhia, R. K. and Lingley-Papadopoulos, C. A.}, Date-Modified = {2006-04-04 18:52:35 +0900}, Journal = {Communications of the {ACM}}, Keywords = {atomic broadcast, group communication, toolkit}, Language = {English}, Number = {4}, Pages = {54--63}, Title = {Totem: a fault-tolerant multicast group communication system}, Url = {http://www.acm.org/pubs/articles/journals/cacm/1996-39-4/p54-moser/p54-moser.pdf}, Volume = {39}, Year = {1996}} @inproceedings{MMTBH95, Author = {Marzo, G. Di and Murhimanya, M. and Tschudin, C. F. and Billard, D. 
and Harms, J.}, Booktitle = {Proceedings of {ICC}'95 workshop on Intelligent Computer Communication}, Keywords = {Messengers,Mobile Computing}, Language = {English}, Title = {The Messenger Paradigm and its implications on Distributed Systems}, Year = {1995}} @inproceedings{MMTBH97, Author = {Marzo, G. Di and Murhimanya, M. and Tschudin, C. F. and Billard, D. and Harms, J.}, Booktitle = {Proceedings of the 2nd European Research Seminar on Advances in Distributed Systems ({ERSADS}'97)}, Keywords = {Messengers,Mobile Computing}, Language = {English}, Pages = {111--116}, Title = {Communication Messengers as a Basic for Distributed Algorithms}, Year = {1997}} @article{MPM80, Author = {Menasce, D. A. and Popek, G. J. and Muntz, R. R.}, Date-Modified = {2005-11-02 15:17:22 +0900}, Journal = {ACM Transactions on Database Systems}, Keywords = {distributed locking, 2PC}, Language = {English}, Number = {2}, Pages = {103--138}, Title = {A Locking Protocol for Resource Coordination in Distributed Databases}, Url = {http://www.acm.org/pubs/articles/journals/tods/1980-5-2/p103-menasce/p103-menasce.pdf}, Volume = {5}, Year = {1980}} @inproceedings{MPR01, Address = {Phoenix, Arizona, USA}, Author = {Miranda, H. and Pinto, A. and Rodrigues, L.}, Booktitle = {Proceedings of the $21^{st}$ International Conference on Distributed Computing Systems ({ICDCS}-01)}, Date-Modified = {2006-05-17 16:42:08 +0900}, Keywords = {toolkit, group communication}, Language = {English}, Pages = {707--710}, Publisher = {IEEE Computer Society}, Title = {Appia: {A} Flexible Protocol Kernel Supporting Multiple Coordinated Channels}, Url = {http://appia.di.fc.ul.pt/appiadocs/miranda01.ps.gz}, Year = {2001}} @inproceedings{MRBKS92, Address = {San Diego, CA USA}, Author = {Mehrotra, S. and Rastogi, R. and Breitbart, Y. and Korth, H. F.
and Silberschatz, A.}, Booktitle = {{PODS}'92 Proceedings of the eleventh {ACM SIGACT-SIGMOD-SIGART} Symposium on Principles of database systems}, Date-Modified = {2005-11-02 15:26:37 +0900}, Isbn = {0-89791-223-3}, Keywords = {federated database}, Language = {English}, Pages = {164--175}, Publisher = {ACM Press}, Title = {Ensuring Transaction Atomicity in Multidatabase Systems}, Url = {http://www.acm.org:80/pubs/articles/proceedings/pods/137097/p164-mehrotra/p164-mehrotra.pdf}, Year = {1992}} @inproceedings{MW97, Address = {Winnipeg, Canada}, Author = {Mishra, S. and Wu, L.}, Booktitle = {Proceedings of the 11th Annual International Symposium on High Performance Computing}, Language = {English}, Pages = {295--306}, Title = {Flow Control in High Performance Atomic Multicast Services}, Year = {1997}} @phdthesis{Mal96, Address = {Switzerland}, Author = {Malloth, C. P.}, Date-Modified = {2006-08-01 14:08:05 +0900}, Keywords = {toolkit, group communication}, Language = {French}, Number = {1557}, School = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne}, Title = {Conception and Implementation of a Toolkit for Building Fault-Tolerant Distributed Applications in Large Scale Networks}, Url = {http://library.epfl.ch/theses/?nr=1557}, Year = {1996}, Abstract = {Large scale systems are becoming more and more common today. Many distributed applications are emerging that use the capability of world-wide internetworking. Since many applications require availability and consistency in the presence of failures, an adequate support for fault-tolerance is necessary. This can be provided by different paradigms and their implementations. Unfortunately, most of these implementations consider only local area networks, whereas this thesis describes a system, called Phoenix, which aims at large scale networks where additional types of failure have to be taken into account. 
This thesis gives a complete description of Phoenix, a toolkit for building fault-tolerant, distributed applications in large scale networks. Fault-tolerance in Phoenix is achieved using replicated process groups, and consistency within one process group is achieved by using view synchronous communication. The particularity of Phoenix is the provision of this fault-tolerance and consistency in a large scale environment, where large scale is two-fold: (1) the wide geographical distribution of the replicated processes, and (2) a high number of participating processes in the system. The description of Phoenix given here is based on its architecture. Each layer of Phoenix focuses on a particular problem and proposes a solution. Lower layers are responsible for the geographical large scale aspects and their problems, whereas higher layers provide high order communication and deal with numerical large scale aspects. In large scale networks, in addition to the increased unpredictable latency of messages, communication protocols have to deal with link failures, which are often only transient. The dynamic routing layer in the Phoenix architecture tries to mask these link failures by rerouting. This rerouting not only gives increased reliability of communication, but also a more stable and accurate image of the reachability of the processes. On top of the dynamic routing layer, the reliable communication layer provides eventually reliable channels, i.e. messages sent are eventually delivered at the destination provided that the sender and the destination processes are correct. This layer takes into account different parameters of large scale networks, such as (1) increased, unpredictable latency, and (2) non-negligible packet desequencing and (3) important packet loss. The consistency among the replicas is based on a new implementation of the virtually synchronous communication paradigm. 
The implementation is part of the view synchronous communication layer and is based on a modified consensus protocol together with the eventually reliable channels of the reliable communication layer. The modified consensus protocol itself is based on an unstable suspicion model, where incorrectly suspected processes can be considered alive at a later point. This will be exploited to make the protocol alive whenever a majority of replicas can communicate with each other. The situation where a distributed system is cut into smaller subsystems, and none of these subsystems contains a majority, is not uncommon in large scale, but is often only transient. Further, the dynamic routing layer already does a maximum to avoid this situation. Based on the view synchronous communication layer, the ordered multicast communication layer provides different ordering primitives based on solid, theoretical definitions, allowing the implementation of different total and uniform orders. The numerical large scale is considered by assigning different roles to the processes of a distributed system without leaving the context of groups. The idea is to concentrate the fault-tolerant aspect to a small set of core processes, whilst still guaranteeing convenient and efficient access semantics to processes outside these core processes.}} @phdthesis{Maz96, Address = {Switzerland}, Author = {Mazouni, K. R.}, Date-Modified = {2006-08-01 14:04:58 +0900}, Keywords = {replication, object invocation}, Language = {French}, Number = {1578}, School = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne}, Title = {{\'E}tude de l'invocation entre objets dupliqu{\'e}s dans un syst{\`e}me r{\'e}parti tol{\'e}rant aux fautes}, Url = {http://library.epfl.ch/en/theses/?nr=1578}, Year = {1996}, Abstract = {This dissertation studies the problems to solve in order to use the invocation paradigm to express replicated object communication in fault-tolerant distributed systems. 
The ultimate goal is to define abstractions which achieve replication encapsulation, i.e. which give the illusion that replication is an internal property of objects. Thus, object communication could be always expressed using the invocation paradigm, whether objects are replicated or not.}} @article{Maz97, Author = {Mazouni, K. R.}, Date-Modified = {2006-10-25 15:34:13 +0900}, Journal = {L'objet-logiciel, base de donn{\'e}es, r{\'e}seaux}, Keywords = {replication, object replication, object invocation, reliability, distributed systems}, Language = {French}, Number = {2}, Pages = {141--160}, Title = {Encapsulation de la Duplication dans le Service {N2M}}, Volume = {3}, Year = {1997}} @inproceedings{McCann93, Abstract-Url = {http://www.cs.city.ac.uk/bibliography/cs/database?TCU/SARC/1993/14}, Address = {Delft, The Netherlands}, Author = {McCann, J. A.}, Booktitle = {European Simulation Symposium}, Date-Modified = {2006-03-08 15:24:39 +0900}, Keywords = {Database, Simulation}, Language = {English}, Title = {Simulating Performance of Parallel Database Systems}, Url = {ftp://ftp.cs.city.ac.uk/papers/93/sarc93-14.ps}, Year = {1993}} @article{Minoura82, Author = {Minoura, T.}, Date-Modified = {2006-03-08 19:13:24 +0900}, Issn = {0004-5411}, Journal = {Journal of the {ACM}}, Keywords = {deadlock}, Language = {English}, Number = {4}, Pages = {1023--1048}, Title = {Deadlock Avoidance Revisited}, Volume = {29}, Year = {1982}, Annote = {The algorithms mentioned are Havender (fixed ordering{\newline} of resources), Modified Havender (dynamical ordering of{\newline} resources), Habermann (do not let a process in{\newline} execution unless all the resources are available), and{\newline} Modified Habermann (based on more localized resource{\newline} claim). An extended model with multiple resources is{\newline} introduced as well.}} @article{Mohan94, Author = {Mohan, C.
and Dievendorff, D.},
  Date-Modified = {2006-05-17 16:40:27 +0900},
  Journal = {Data Engineering Bulletin},
  Keywords = {atomic commitment, MOM},
  Language = {English},
  Number = {1},
  Title = {Recent Work on Distributed Commit Protocols, and Recoverable Messaging and Queuing},
  Url = {http://www.almaden.ibm.com/u/mohan/RJ3881.pdf},
  Volume = {17},
  Year = {1994}}

@inproceedings{Mohan99,
  Address = {Philadelphia, Pennsylvania {USA}},
  Author = {Mohan, C.},
  Author-Url = {http://www.almaden.ibm.com/u/mohan/},
  Booktitle = {Proceedings of International Conference on Management of Data (SIGMOD)},
  Date-Modified = {2006-08-23 18:19:09 +0900},
  Editor = {Delis, A. and Faloutsos, C. and Ghandeharizadeh, S.},
  Isbn = {1-58113-084-8},
  Keywords = {groupware},
  Language = {English},
  Note = {tutorial, paper version in~\cite{MBWSZ00}},
  Pages = {507},
  Publisher = {ACM Press},
  Title = {A Database Perspective on Lotus Domino/Notes},
  Year = {1999}}

%% NJ99: first author spelled Nicola, matching the ~nicola path in the Url.
@inproceedings{NJ99,
  Address = {Jerusalem},
  Author = {Nicola, M. and Jarke, M.},
  Booktitle = {Proceedings of the International Conference on Database Theory ({ICDT})},
  Language = {English},
  Title = {Increasing the Expressiveness of Analytical Performance Models for Replicated Databases},
  Url = {http://www-i5.informatik.rwth-aachen.de/~nicola/ICDT.PS},
  Year = {1999}}

%% NMM99: field name was misspelled "Langage", so the language tag was lost.
@inproceedings{NMM99,
  Address = {Lausanne, Switzerland},
  Author = {Narasimhan, P. and Moser, L. and Melliar-Smith, P.},
  Booktitle = {Proceedings of the $18^{th}$ {IEEE} Symposium on Reliable Distributed Systems ({SRDS}'99)},
  Isbn = {0-7695-0290-3},
  Language = {English},
  Pages = {263--273},
  Publisher = {IEEE},
  Title = {Enforcing Determinism for the Consistent Replication of Multithreaded {CORBA} Applications},
  Year = {1999}}

@inproceedings{NSB97,
  Author = {N{\o}rv{\aa}g, K. and Sandst{\aa}, O. and Bratbergsengen, K.},
  Booktitle = {ADBIS},
  Date-Modified = {2006-08-01 14:02:13 +0900},
  Keywords = {concurency control, transaction processing},
  Language = {English},
  Location = {St.
Petersburg (Russia)},
  Pages = {9--17},
  Title = {Concurrency Control in Distributed Object-Oriented Database Systems},
  Url = {http://www.idt.unit.no/IDT/grupper/DB-grp/tech_papers/ADBIS97_dbsim/adbis97.html},
  Year = {1997}}

@article{Neuman02,
  Author = {Neumann, P.~G.},
  Journal = {ACM Committee on Computers and Public Policy},
  Language = {English},
  Number = {87},
  Title = {The Risks Digest: Forum on Risks to the Public in Computers and Related Systems},
  Url = {http://catless.ncl.ac.uk/Risks/21.87.html},
  Volume = {21},
  Year = {2002}}

@inbook{OMG:CCS,
  Address = {Framingham Corporate Center, 492 Old Connecticut Path, Framingham, {MA} 01701-4568 {USA}},
  Chapter = {7 - Concurrency Control Server Specifications},
  Date-Modified = {2006-05-17 16:40:52 +0900},
  Editor = {OMG},
  Key = {CCS},
  Keywords = {CORBA},
  Language = {English},
  Pages = {7--1--7--14},
  Publisher = {Object Management Group},
  Title = {{CORBA}services},
  Year = {1996}}

@inbook{OMG:CORBA:FT,
  Address = {250 First Avenue, Suite 201 Needham, MA 02494 {USA}},
  Chapter = {Fault-Tolerant CORBA},
  Date-Modified = {2006-03-08 19:38:59 +0900},
  Key = {OMG},
  Keywords = {CORBA, standard},
  Language = {English},
  Pages = {25--1--25--116},
  Publisher = {Object Management Group},
  Title = {Common Object Request Broker Architecture ({CORBA}) version 2.5},
  Url = {ftp://ftp.omg.org/pub/docs/formal/01-09-29.pdf},
  Year = {2001}}

@inbook{OMG:OTS,
  Address = {Framingham Corporate Center, 492 Old Connecticut Path, Framingham, {MA} 01701-4568 {USA}},
  Chapter = {10 - Transaction Service Specifications},
  Date-Modified = {2006-05-17 16:40:42 +0900},
  Editor = {OMG},
  Key = {OTS},
  Keywords = {CORBA},
  Language = {English},
  Pages = {327--416},
  Publisher = {Object Management Group},
  Title = {{CORBA}services},
  Year = {1996}}

%% ORBOS:FTCORBA: corporate author double-braced so it is one indivisible
%% name and is never abbreviated to initials.
@techreport{ORBOS:FTCORBA,
  Address = {Framingham Corporate Center, 492 Old Connecticut Path, Framingham, {MA} 01701-4568 {USA}},
  Author = {{Object Management Group}},
  Date-Modified = {2006-08-01 13:54:05 +0900},
  Institution =
{Object Management Group},
  Keywords = {CORBA, standard},
  Language = {English},
  Number = {orbos/98-04-01},
  Title = {Fault Tolerant {CORBA} using entity redundancy},
  Type = {Request for Proposal},
  Url = {http://www.omg.org/cgi-bin/doc?orbos/98-04-01.pdf},
  Year = {1998}}

@article{OV91,
  Author = {{\"O}zsu, M. T. and Valduriez, P.},
  Date-Modified = {2006-04-06 16:09:27 +0900},
  Journal = {{IEEE} Computer},
  Keywords = {distributed databases,distributed transactions, Multidatabase},
  Language = {English},
  Number = {8},
  Pages = {68--78},
  Title = {Distributed Database Systems: Where Are We {N}ow?},
  Url = {http://web.cs.ualberta.ca/~database/publications/ozsu/distdb/short.ps},
  Volume = {24},
  Year = {1991}}

%% OV96: a chapter in an edited handbook, carrying Booktitle but neither
%% Chapter nor Pages, so it is an incollection entry rather than inbook.
@incollection{OV96,
  Author = {{\"O}zsu, M. T. and Valduriez, P.},
  Booktitle = {Handbook of Computer Science and Engineering},
  Keywords = {distributed databases, parallel databases},
  Language = {English},
  Publisher = {CRC Press},
  Title = {Distributed and Parallel Database Systems},
  Url = {http://web.cs.ualberta.ca/~database/publications/ozsu/handbook/handbook.ps},
  Year = {1996}}

@techreport{Oracle95,
  Author = {Delmolino, D.
J.},
  Date-Modified = {2006-04-11 10:36:13 +0900},
  Institution = {Oracle Corporation},
  Keywords = {database, database replication},
  Language = {English},
  Release = {7.0},
  Title = {Strategies and Techniques for Using {O}racle~7 Replication},
  Year = {1995}}

%% In the two Oracle8i titles below, the empty group after the trademark
%% macro keeps the following space from being swallowed.
@manual{Oracle:AdvRepl98,
  Address = {500, Oracle Parkway, Redwood City, CA 94065},
  Date-Modified = {2006-03-08 19:24:35 +0900},
  Key = {Oracle8},
  Keywords = {database, manual},
  Language = {English},
  Note = {{O}racle Technical White Paper},
  Organization = {{O}racle Corporation},
  Release = {8.0},
  Title = {{Oracle8i}\texttrademark{} Advanced Replication},
  Year = {1998}}

@inbook{Oracle:SCN8-30,
  Chapter = {30},
  Crossref = {Oracle:SCN8},
  Editor = {{Oracle Corporation}},
  Language = {English},
  Publisher = {{O}racle Corporation},
  Title = {{Oracle8i}\texttrademark{} Manual - Database Replication},
  Year = {1998}}

@techreport{Oracle:SI,
  Address = {Oracle Corporation, 500 Oracle Parkway, Redwood City, CA 94065},
  Date-Modified = {2006-08-01 14:02:30 +0900},
  Institution = {{O}racle Corporation},
  Key = {Oracle},
  Keywords = {concurency control, commercial},
  Language = {English},
  Title = {Concurrency Control, Transaction Isolation and Serializability in {SQL92} and {Oracle7}},
  Type = {White Paper},
  Year = {1995}}

@article{PBW+00,
  Author = {Barrett, P. and Poledna, S. and Burns, A. and Wellings, A.},
  Date-Modified = {2006-03-08 19:15:19 +0900},
  Journal = {{IEEE} Transactions on Computers},
  Keywords = {determinism},
  Language = {English},
  Number = {2},
  Pages = {100--111},
  Title = {Replica Determinism and Flexible Scheduling in Hard Real-Time Dependable Systems},
  Url = {http://www.computer.org:80/tc/tc2000/t0100abs.htm},
  Volume = {49},
  Year = {2000}}

@article{PCD91,
  Address = {New York, NY},
  Author = {Powell, D. and Ch{\'e}r{\'e}que, M.
and Drackley, D.}, Date-Modified = {2006-07-28 18:46:05 +0900}, Journal = {{ACM} Operating Systems Review, {SIGOPS}}, Keywords = {toolkit, fault-tolerance, semi-active}, Language = {English}, Number = {2}, Pages = {122--125}, Publisher = {ACM Press}, Title = {Fault-Tolerance in {D}elta-4*}, Volume = {25}, Year = {1991}} @techreport{PF00, Address = {Palo Alto {CA} 94304 {USA}}, Author = {Pedone, F. and Fr{\o}lund, S.}, Date-Modified = {2005-11-02 15:36:12 +0900}, Institution = {Software Technology Laboratory, Hewlett-Packard Laboratories}, Keywords = {database replication, fault-tolerance, transaction processing, 3-tier}, Language = {English}, Number = {HPL-2000-96}, Title = {{P}ronto: {A} Fast Failover Protocol for Off-the-shelf Commercial Databases}, Url = {http://www.hpl.hp.com/techreports/2000/HPL-2000-96.pdf}, Year = {2000}} @inproceedings{PF00b, Author = {Pedone, F. and Fr{\o}lund, S.}, Booktitle = {Proceedings of ${19}^{th}$ Symposium on Reliable Distributed Systems (SRDS'2000)}, Date-Modified = {2006-08-01 13:30:27 +0900}, Institution = {Hewlett-Packard Laboratories, Palo Alto {CA} 94304 {USA}}, Isbn = {0-7695-0543-0}, Keywords = {database replication, atomic broadcast}, Language = {English}, Location = {N{\"u}rnberg, Germany}, Organization = {IEEE}, Title = {{P}ronto: {A} fast Failover Protocol for Off-the-shelf Commercial Databases}, Url = {http://www.hpl.hp.com/techreports/2000/HPL-2000-96.pdf}, Year = {2000}, Abstract = {Enterprise applications typically store their state in databases. If a database fails, the application is unavailable while the database recovers. Database recovery is time consuming because it involves replaying the persistent transaction log. To isolate end users from database failures, we introduce Pronto, a protocol to orchestrate the transaction processing by multiple, standard databases so that they collectively implement the illusion of a single, highly available database. 
The key challenge in implementing this illusion is to enable fast failover from one database to another so that database failures do not interrupt the transaction processing. We solve this problem with a novel replication protocol that handles non-determinism without relying on perfect failure detection.},
  Annote = {short version of \cite{PF00}}}

%% PG97: the conference name had been pasted twice into Booktitle.
@inproceedings{PG97,
  Author = {Pedone, F. and Guerraoui, R.},
  Booktitle = {Proceedings of the Pacific Rim International Symposium on Fault-Tolerant Systems ({PRFTS}'97)},
  Date-Modified = {2006-08-01 13:33:43 +0900},
  Isbn = {0-8186-8212-4},
  Keywords = {transaction processing},
  Language = {English},
  Organization = {IEEE},
  Pages = {104--110},
  Title = {On Transaction Liveness in Replicated Databases},
  Url = {http://doi.ieeecomputersociety.org/10.1109/PRFTS.1997.640133},
  Year = {1997},
  Abstract = {This paper makes a first attempt to give a precise characterization of liveness in replicated database systems. We introduce the notion of liveness degrees, which express the expectation a database user might have about the termination of transactions, despite concurrency and failures. Our liveness degrees are complementary to the traditional transactional safety degrees (e.g., serializability) and lead to a better characterization of the reliability of replication protocols. We present a generic framework that abstracts several well-known replication protocols and we point out an interesting trade-off between liveness and safety properties in these protocols.}}

@article{PGM89,
  Author = {Pittelli, F. and Garcia-Molina, H.},
  Journal = {{ACM} Transactions on Computer Systems},
  Language = {English},
  Number = {1},
  Pages = {25--60},
  Title = {Reliable Scheduling in a {TMR} Database System},
  Volume = {7},
  Year = {1989}}

@article{PGM94,
  Abstract-Url = {http://www.acm.org/pubs/citations/journals/tods/1994-19-3/p423-polyzois/#abstract},
  Affiliation = {IBM Thomas J.
Watson Res. Center, Yorktown Heights, NY, USA},
  Author = {Polyzois, C. A. and Garc{\'\i}a-Molina, H.},
  Issn = {0362-5915},
  Journal = {{ACM} Transactions on Database Systems},
  Keywords = {database, algorithms; performance; reliability},
  Language = {English},
  Number = {3},
  Pages = {423--449},
  Title = {Evaluation of Remote Backup Algorithms for Transaction-Processing Systems},
  Url = {http://www.acm.org/pubs/articles/journals/tods/1994-19-3/p423-polyzois/p423-polyzois.pdf},
  Volume = {19},
  Year = {1994}}

%% PGM94 (entry ending just above): Journal, required for article entries,
%% was missing; supplied from ISSN 0362-5915 and the tods path of its Url.

@inproceedings{PGS97,
  Address = {Durham, North Carolina, USA},
  Author = {Pedone, F. and Guerraoui, R. and Schiper, A.},
  Booktitle = {Proceedings of the $16^{th}$ Symposium on Reliable Distributed Systems (SRDS-16)},
  Date-Modified = {2006-05-17 16:26:39 +0900},
  Isbn = {0-8186-8177-2},
  Keywords = {transaction processing, Database replication},
  Language = {English},
  Pages = {175--182},
  Title = {Transaction Reordering in Replicated Databases},
  Url = {http://www.inf.unisi.ch/faculty/pedone/papers/1997SRDS.pdf},
  Year = {1997},
  Abstract = {The paper presents a fault tolerant lazy replication protocol that ensures 1-copy serializability at a relatively low cost. Unlike eager replication approaches, our protocol enables local transaction execution and does not lead to any deadlock situation. Compared to previous lazy replication approaches, we significantly reduce the abort rate of transactions and we do not require any reconciliation procedure. Our protocol first executes transactions locally, then broadcasts a transaction certification message to all replica managers, and finally employs a certification procedure to ensure 1-copy serializability. Certification messages are broadcast using a non blocking atomic broadcast primitive, which alleviates the need for a more expensive non blocking atomic commitment algorithm.
The certification procedure uses a reordering technique to reduce the probability of transaction aborts}}

@inproceedings{PGS98,
  Abstract-Url = {http://lsewww.epfl.ch/Documents/abstract/PGS98.txt},
  Alternate-Key = {Pedone98b},
  Author = {Pedone, F. and Guerraoui, R. and Schiper, A.},
  Booktitle = {Proceedings of EuroPar ({EuroPar}'98)},
  Date-Modified = {2006-05-17 17:04:31 +0900},
  Keywords = {Dragon, database replication, group communication},
  Language = {English},
  Title = {Exploiting Atomic Broadcast in Replicated Databases},
  Url = {http://citeseer.nj.nec.com/pedone98exploiting.html},
  Year = {1998}}

%% Accents below are written as brace groups that open with the accent
%% command, the form BibTeX treats as a single letter.
@techreport{PGS99,
  Address = {Switzerland},
  Author = {Pedone, F. and Guerraoui, R. and Schiper, A.},
  Date-Modified = {2006-05-01 17:33:36 +0900},
  Institution = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne},
  Keywords = {database replication},
  Number = {SSC/1999/008},
  Title = {The Database State Machine Approach},
  Url = {http://lsewww.epfl.ch/Documents/postscript/PGS99.ps},
  Year = {1999}}

@inproceedings{PJA01,
  Author = {Pati{\~n}o-Mart{\'\i}nez, M. and Jim{\'e}nez-Peris, R. and Ar{\'e}valo, S.},
  Booktitle = {Workshop on Concurrency in Dependable Computing},
  Date-Modified = {2006-08-01 14:13:07 +0900},
  Keywords = {group communication, transaction processing},
  Language = {English},
  Location = {Newcastle Upon Tyne, United Kingdom},
  Title = {Group Transactions: An Integrated Approach to Transactions and Group Communication},
  Url = {http://lml.ls.fi.upm.es/~rjimenez/papers/2001/grouptrans.pdf},
  Year = {2001}}

%% PJKA00: author names aligned with PJA01 (the old spelling printed a
%% doubled n in the first surname and gave Paris for Peris).
@inproceedings{PJKA00,
  Address = {Toledo, Spain},
  Author = {Pati{\~n}o-Mart{\'\i}nez, M. and Jim{\'e}nez-Peris, R. and Kemme, B.
and Alonso, G.},
  Booktitle = {Proceedings of the $14^{th}$ International Conference on Distributed Computing (DISC 2000)},
  Editor = {Herlihy, M.},
  Isbn = {3-540-41143-7},
  Key = {PJKA00},
  Language = {English},
  Pages = {315--329},
  Publisher = {Springer Verlag, Berlin},
  Series = {Lecture Notes in Computer Science},
  Title = {Scalable Replication in Database Clusters},
  Volume = {1914},
  Year = {2000}}

%% PMS99: Month used to hold the day range 7--10; the range is kept in a
%% Day field and Month now names the month (VLDB 1999 was held in September).
@inproceedings{PMS99,
  Address = {Edinburgh - Scotland - UK},
  Author = {Pacitti, E. and Minet, P. and Simon, E.},
  Booktitle = {Proceedings of the $25^{th}$ International Conference on Very Large Databases},
  Day = {7--10},
  Language = {English},
  Month = {September},
  Title = {Fast Algorithms for Maintaining Replica Consistency in Lazy Master Replicated Databases},
  Year = {1999}}

@techreport{POET99,
  Institution = {{POET} Software},
  Key = {Poet},
  Language = {English},
  Number = {OSS-FOWHIT-27JUL99},
  Title = {{POET} FastObject Technology},
  Type = {Technical Presentation},
  Year = {1999}}

@techreport{POET:PSS,
  Author = {{INPRISE} and {O}bjectivity and {S}ecant and {S}un},
  Date-Modified = {2006-05-17 16:38:46 +0900},
  Institution = {OMG},
  Keywords = {service},
  Language = {English},
  Note = {with collaboration and support from {POET} and others},
  Number = {orbos/98-08-02},
  Title = {Persistent State Service 2.0},
  Type = {Revised Joint Submission},
  Year = {1998}}

@inproceedings{PRW01,
  Address = {Bologna, Italy},
  Author = {Pasin, M. and Riveill, M. and Weber, T.~S.},
  Booktitle = {Proceedings of the European Research Seminar on Advances in Distributed Systems (ERSADS2001)},
  Language = {English},
  Title = {High-Available Enterprise {JavaBeans} Using Group Communication System Support},
  Url = {http://www.cs.unibo.it/ersads/papers/pasin.pdf},
  Year = {2001}}

@inproceedings{PS98,
  Abstract-Url = {http://lsewww.epfl.ch/Documents/abstract/PS98.txt},
  Alternate-Key = {Pedone98},
  Author = {Pedone, F.
and Schiper, A.}, Booktitle = {Proceedings of the $12^{th}$ International Symposium on Distributed Computing ({DISC}'98, formerly {WDAG})}, Date-Modified = {2006-05-01 17:32:03 +0900}, Keywords = {Database replication}, Language = {English}, Title = {Optimistic Atomic Broadcast}, Url = {http://www.springer.de/comp/lncs/index.html}, Year = {1998}} @inproceedings{PS99c, Author = {Pedone, F. and Schiper, A.}, Booktitle = {Proceedings of the $13^{th}$ International Symposium on Distributed Computing (DISC'99, formerly WDAG)}, Date-Modified = {2006-05-01 17:32:46 +0900}, Keywords = {broadcast}, Language = {English}, Title = {Generic Broadcast}, Url = {http://lsewww.epfl.ch/Documents/acrobat/PS99c.pdf}, Year = {1999}} @inproceedings{PSTT96, Address = {Connemara, Ireland}, Author = {Petersen, K. and Spreitzer, M. and Terry, D. and Theimer, M.}, Booktitle = {$7^{th}$ ACM SIGOPS European Workshop}, Institution = {Xerox PARC}, Keywords = {Bayou}, Language = {English}, Title = {{B}ayou: Replicated Database Services for World-Wide Applications}, Url = {http://www.parc.xerox.com/csl/projects/bayou/}, Year = {1996}} @techreport{PSWL94, Author = {Parrington, G.~D. and Shrivastava, S.~K. and Wheater, S.~M. 
and Little, M.~C.}, Institution = {ESPRIT Basic Research Project BROADCAST}, Language = {English}, Number = {TR94-65}, Pages = {39}, Title = {The Design and Implementation of {A}rjuna}, Url = {http://www.research.ec.org/broadcast/trs/papers/65.ps}, Year = {1994}} @inproceedings{PU88, Address = {New York}, Author = {Pu, C.}, Booktitle = {Proceedings Fourth International Conference on Data Engineering}, Date-Modified = {2005-11-02 15:25:46 +0900}, Keywords = {federated database}, Language = {English}, Pages = {548--555}, Publisher = {IEEE}, Title = {Superdatabases for Composition of Heterogeneous Databases}, Year = {1988}, Annote = {The proposal is to nest heterogeneous systems in a{\newline} tree like structure with each node containing a global{\newline} schema for itself, and all its child nodes. Thus the{\newline} schema itself is distributed. The main interest seems{\newline} to being how this can aid in concurrency control.}} @inproceedings{PW90, Author = {Prusker, F.~J. and Wobber, E.~P.}, Booktitle = {Proceedings of the IEEE Workshop on the Management of Replicated Data}, Keywords = {lazy replication}, Language = {English}, Organization = {IEEE}, Title = {The Siphon: Managing Distant Replicated Repositories}, Url = {ftp://ftp.digital.com/pub/DEC/PRL/research-reports/PRL-RR-7.ps.Z}, Year = {1990}} @phdthesis{Panagos96, Abstract-Url = {ftp://cs-ftp.bu.edu/techreports/abstracts/96-010}, Author = {Panagos, E.}, Keywords = {distributed transactions}, Language = {English}, School = {Boston University}, Title = {Client Based Logging: {A} New Paradigm for Distributed Transaction Management}, Url = {ftp://cs-ftp.bu.edu/techreports/96-010-client-based-logging.ps.Z}, Year = {1996}} @book{Papadimitriou86, Author = {Papadimitriou, C.}, Language = {English}, Publisher = {Computer Science Press}, Title = {The Theory of Database Concurrency Control}, Year = {1986}} @phdthesis{Ped99, Abstract-Url = {http://lsewww.epfl.ch/Documents/abstract/Ped99.txt}, Address = {Switzerland}, 
Author = {Pedone, F.},
  Keywords = {Dragon},
  Language = {English},
  Number = {2090},
  School = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne},
  Title = {The Database State Machine and Group Communication Issues},
  Url = {http://lsewww.epfl.ch/Documents/acrobat/Ped99.pdf},
  Year = {1999}}

@article{Pedone01,
  Abstract-Url = {http://www.computer.org/computer/co2001/rz080abs.htm},
  Author = {Pedone, F.},
  Date-Modified = {2006-03-08 19:08:30 +0900},
  Journal = {{IEEE} Computer},
  Keywords = {distributed computing, optimistic technique},
  Language = {English},
  Number = {12},
  Pages = {80--86},
  Title = {Boosting System Performance with Optimistic Distributed Protocols},
  Url = {http://dlib2.computer.org/co/books/co2001/pdf/rz080.pdf},
  Volume = {34},
  Year = {2001}}

@inproceedings{Pedone97,
  Author = {Pedone, F.},
  Booktitle = {Proceedings of the $2^{nd}$ European Research Seminar on Advances in Distributed Systems ({ERSADS}'97)},
  Date-Modified = {2006-08-01 13:15:48 +0900},
  Keywords = {database replication, optimistic technique},
  Language = {English},
  Location = {Zinal, Valais, Switzerland},
  Pages = {273--278},
  Title = {A Closer Look at Optimistic Replica Control},
  Year = {1997}}

%% Plank97: spelling of Knoxville, Department and Tennessee corrected.
%% NOTE(review): ZIP changed from 27996 to 37996 (the Knoxville campus
%% code); confirm against the report cover.
@techreport{Plank97,
  Address = {Knoxville, TN 37996 {USA}},
  Alt-Url = {ftp://cs.utk.edu/pub/TechReports/1997/ut-cs-97-372.ps.Z},
  Author = {Plank, J.~S.},
  Author-Url = {http://www.cs.utk.edu/~plank/},
  Date-Modified = {2006-05-17 16:43:40 +0900},
  Institution = {Department of Computer Science, University of Tennessee},
  Keywords = {checkpointing},
  Language = {English},
  Number = {UT-CS-97-372},
  Title = {An Overview of Checkpointing in Uniprocessor and Distributed Systems Focusing on Implementation and Performance},
  Url = {http://citeseer.nj.nec.com/plank97overview.html},
  Year = {1997}}

@book{Pol95,
  Author = {Poledna, S.},
  Date-Modified = {2006-09-15 15:27:53 +0900},
  Isbn = {0-7923-9657-X},
  Language = {English},
  Publisher = {Kluwer},
  Series = {Engineering and Computer Science},
  Title = {Fault-Tolerant Real-Time
Systems: the Problem of Replica Determinism},
  Url = {http://www.wkap.nl/prod/b/0-7923-9657-X},
  Volume = {345},
  Year = {1995}}

@techreport{Pu90,
  Address = {New York, {NY} 10027 {USA}},
  Author = {Pu, C.},
  Date-Modified = {2005-11-02 15:25:54 +0900},
  Institution = {Department of Computer Science, Columbia University},
  Keywords = {federated database},
  Language = {English},
  Number = {TR CUCS-24},
  Title = {The Superdatabase Architecture: Cooperative Heterogeneous Transactions},
  Url = {http://www.cse.ogi.edu/~lingliu/courses/cse515/superdb.ps},
  Year = {1990}}

%% Pu92: the ordinal suffix is now grouped, so both letters of "rd" are
%% raised instead of only the "r".
@inproceedings{Pu92,
  Address = {Boulder {USA}},
  Author = {Pu, C.},
  Booktitle = {Proceedings of $3^{rd}$ Workshop on Heterogeneous Databases and Semantic Interoperability},
  Date-Modified = {2006-08-01 14:01:15 +0900},
  Institution = {Department of Computer Science, Columbia University, New York {USA}},
  Keywords = {transaction processing, serializability},
  Language = {English},
  Organization = {NSF},
  Title = {Asynchronous Transaction Processing with Epsilon-Serializability},
  Url = {http://citeseer.nj.nec.com/340885.html},
  Year = {1992}}

@inproceedings{RB91,
  Author = {Ricciardi, A. M. and Birman, K. P.},
  Booktitle = {Proc. of the 10$^{th}$ Symp. on Principles of Distributed Computing {PODC}},
  Date-Modified = {2006-07-15 12:00:19 +0900},
  Isbn = {0-89791-439-2},
  Language = {English},
  Location = {Montreal, Quebec, Canada},
  Month = {August},
  Pages = {341--353},
  Title = {Using process groups to implement failure detection in asynchronous environments},
  Url = {http://www.acm.org/pubs/articles/proceedings/podc/112600/p341-ricciardi/p341-ricciardi.pdf},
  Year = {1991}}

%% RB95: Month held only a day number followed by a comma, which styles
%% would print as the month. The value is kept in a Day field.
%% NOTE(review): the actual month is not recorded here; add it once known.
@techreport{RB95,
  Author = {{van Renesse}, R. and Birman, K. P.},
  Day = {29},
  Institution = {Cornell University, Computer Science Department},
  Language = {English},
  Number = {TR95-1505},
  Pages = {11},
  Title = {Protocol Composition in {Horus}},
  Type = {Technical Report},
  Year = {1995}}

@article{RBM96,
  Alt-Key = {Horus:RBM96},
  Author = {{van Renesse}, R.
and Birman, K. P. and Maffeis, S.},
  Date-Modified = {2006-05-17 16:48:06 +0900},
  Issn = {0001-0782},
  Journal = {Communications of the {ACM}},
  Keywords = {distributed systems, group communication},
  Language = {English},
  Number = {4},
  Pages = {76--83},
  Title = {{Horus}: {A} Flexible Group Communication System},
  Url = {http://www.acm.org/pubs/articles/journals/cacm/1996-39-4/p76-van_renesse/p76-van_renesse.pdf},
  Volume = {39},
  Year = {1996}}

@inproceedings{RFV96,
  Address = {Hong Kong},
  Author = {Rodrigues, L. and Fonseca, H. and Ver{\'\i}ssimo, P.},
  Booktitle = {Proceedings of the $16^{th}$ International Conference on Distributed Computing Systems ({ICDCS}'96)},
  Date-Modified = {2005-09-19 14:44:38 +0900},
  Isbn = {0-8186-7398-2},
  Language = {English},
  Pages = {503--510},
  Publisher = {IEEE},
  Title = {Totally Ordered Multicast in Large-Scale Systems},
  Url = {http://citeseer.nj.nec.com/rodrigues96totally.html},
  Year = {1996}}

%% RR00: conference name completed to match the other ICDCS entries
%% (Distributed Computing Systems).
@inproceedings{RR00,
  Author = {Rodrigues, L. and Raynal, M.},
  Booktitle = {Proceedings of the $20^{th}$ International Conference on Distributed Computing Systems ({ICDCS'2000})},
  Date-Modified = {2006-08-01 13:17:36 +0900},
  Isbn = {0-7695-0601-1},
  Keywords = {atomic broadcast, crash recovery},
  Language = {English},
  Location = {Taipei, Taiwan (ROC)},
  Organization = {IEEE},
  Pages = {288--295},
  Title = {Atomic Broadcast in Asynchronous Crash-Recovery Distributed Systems},
  Url = {http://doi.ieeecomputersociety.org/10.1109/ICDCS.2000.840941},
  Year = {2000}}

@article{RST95,
  Author = {Rangarajan, S. and Setia, S. and Tripathi, S.
K.},
  Date-Modified = {2006-03-08 19:13:12 +0900},
  Journal = {{IEEE} Transactions on Parallel and Distributed Systems},
  Keywords = {replication},
  Language = {English},
  Number = {12},
  Pages = {1271--1282},
  Title = {A Fault-Tolerant Algorithm for Replicated Data Management},
  Volume = {6},
  Year = {1995}}

%% RV93: accent placed on a dotless i, in the same form RFV96 uses.
@techreport{RV93,
  Address = {Department of Computing Science, University of Newcastle, Newcastle upon Tyne, NE1 7RU, UK},
  Author = {Rodrigues, L. and Ver{\'\i}ssimo, P.},
  Institution = {Broadcast Technical Report},
  Number = {93-16},
  Title = {The {ROMANCE} approach to Replicated Object Management},
  Url = {http://www.research.ec.org/broadcast/trs/papers/16.ps},
  Year = {1993}}

%% Rahm93: Journal, required for article entries, was missing; supplied
%% from the tods path of the entry's own Url.
@article{Rahm93,
  Affiliation = {Dept. of Comput. Sci., Kaiserslautern Univ., Germany},
  Author = {Rahm, E.},
  Date-Modified = {2006-05-17 16:48:06 +0900},
  Journal = {{ACM} Transactions on Database Systems},
  Keywords = {database, concurency control, distributed systems, simulation},
  Language = {English},
  Number = {2},
  Pages = {333--337},
  Title = {Empirical Performance Evaluation of Concurrency and Coherency Control Protocols for Database Sharing Systems},
  Url = {http://www.acm.org/pubs/citations/journals/tods/1993-18-2/p333-rahm/},
  Volume = {18},
  Year = {1993}}

@inproceedings{RTA97,
  Author = {Raynal, M. and Thia-Kime, G. and Ahamad, M.},
  Booktitle = {Proceedings of the 23rd Conference '97 New Frontiers of Information Technology},
  Date-Modified = {2006-08-01 14:11:15 +0900},
  Keywords = {serializability},
  Language = {English},
  Organization = {EUROMICRO},
  Title = {From Serializable to Causal Transactions for Collaborative Applications},
  Url = {http://doi.ieeecomputersociety.org/10.1109/EURMIC.1997.617301},
  Year = {1997},
  Abstract = {Services in decentralized distributed systems can be implemented using shared distributed objects. When these objects are accessed concurrently, serializability (the traditional consistency criterion) can be used to define their execution behaviour.
However, this consistency criterion has a major drawback because it imposes strong synchronization constraints on the execution of applications which cannot be met efficiently in decentralized systems. In this paper, we examine weaker consistency criteria for computations in which accesses to shared objects are grouped to form transactions. The guarantees provided by transactions (e.g. concurrency and failure atomicity) make them attractive when computations manipulate the state of long-lived objects. We explore two new criteria: causal consistency and causal serializability. These criteria turn out to be sufficient for a class of applications (e.g. collaborative applications) and their implementation results in lesser synchronization and hence improved autonomy, availability and performance. These criteria are formally defined and protocols implementing them are presented.}}

@article{Raynal98,
  Author = {Raynal, M.},
  Date-Modified = {2006-03-08 19:14:08 +0900},
  Journal = {Bulletin of the Technical Committee on Data Engineering},
  Keywords = {consensus, replication},
  Language = {English},
  Number = {4},
  Pages = {30--37},
  Title = {Consensus Based Management of Distributed and Replicated Data},
  Volume = {21},
  Year = {1998}}

%% Reed83: Journal, required for article entries, was missing.
%% NOTE(review): supplied as ACM TOCS, whose volume 1 number 1 appeared in
%% 1983; confirm against the publication.
@article{Reed83,
  Author = {Reed, D. P.},
  Date-Modified = {2006-03-08 19:16:01 +0900},
  Journal = {{ACM} Transactions on Computer Systems},
  Key = {Reed},
  Keywords = {atomic actions},
  Language = {English},
  Number = {1},
  Pages = {3--23},
  Title = {Implementing Atomic Actions on Decentralized Data},
  Volume = {1},
  Year = {1983}}

@mastersthesis{Riedweg99,
  Abstract-Url = {http://www.inf.ethz.ch/department/IS/iks/diplomaworks/bettinasem3.html},
  Author = {Riedweg, G.},
  Keywords = {Dragon, Database Replication},
  Language = {German},
  School = {Eidgen{\"o}ssische Technische Hochschule Z{\"u}rich},
  Title = {Entwicklung eines Replikationsmanager},
  Type = {Semester Project},
  Year = {1999}}

@inproceedings{SAA97,
  Address = {Santa Barbara, California {USA}},
  Author = {Stanoi, I. and Agrawal, D. and Abbadi, A.
El}, Booktitle = {Proceedings of the $16^{th}$ Annual Symposium on Principles of Distributed Computing}, Language = {English}, Organization = {ACM}, Pages = {283}, Title = {Using Broadcast Primitives in Replicated Databases (Abstract)}, Year = {1997}} @inproceedings{SAA98, Address = {Amsterdam, The Netherlands}, Author = {Stanoi, I. and Agrawal, D. and Abbadi, A. El}, Booktitle = {Proceedings of the $18^{th}$ {IEEE} International Conference on Distributed Computing Systems {ICDCS}'98}, Language = {English}, Organization = {IEEE}, Pages = {148--155}, Title = {Using Broadcast Primitives in Replicated Databases}, Url = {http://www.cs.ucsb.edu/~ioana/icdcs98.ps}, Year = {1998}} @article{SBCM95, Author = {Samaras, G. and Britton, K. and Citron, A. and Mohan, C.}, Date-Modified = {2006-03-08 19:12:00 +0900}, Journal = {Distributed and Parallel Databases}, Keywords = {database, atomic commitment, 2PC}, Language = {English}, Number = {4}, Pages = {325--360}, Title = {Two-Phase Commit Optimizations in a Commercial Distributed Environment}, Volume = {3}, Year = {1995}} @article{SC93, Author = {Stamos, James W. and Cristian, Flaviu}, Date-Modified = {2006-03-08 19:14:26 +0900}, Journal = {Distributed and Parallel Databases}, Keywords = {transaction processing}, Language = {English}, Number = {4}, Pages = {383--408}, Title = {Coordinator Log Transaction Execution Protocol}, Url = {ftp://ftp.cs.ucsd.edu/pub/team/coordlog.ps.Z}, Volume = {1}, Year = {1993}} @inproceedings{SFKL+98, Address = {Chicago IL USA}, Author = {Stelling, P. and Foster, I. and Kesselman, C. and Lee, C. 
and von Laszewski, G.},
  Booktitle = {Proceedings of the $7^{th}$ Symposium on High Performance Distributed Computing},
  Date-Modified = {2006-04-11 10:24:52 +0900},
  Isbn = {0-8186-8579-4},
  Keywords = {failure detection},
  Language = {English},
  Organization = {IEEE},
  Pages = {268--278},
  Title = {A fault detection service for wide area distributed computations},
  Url = {http://citeseer.ist.psu.edu/stelling98fault.html},
  Year = {1998}}

%% SHORE94: the entry carries a Booktitle, a conference city and no report
%% number, so it is typed as a conference paper; as a techreport the
%% Booktitle was never printed. Institution is kept for reference.
%% NOTE(review): revert the type if a Wisconsin report was really meant.
@inproceedings{SHORE94,
  Address = {Minneapolis, MN},
  Author = {Carey, M. and DeWitt, D. and Naughton, J. and Solomon, M.},
  Booktitle = {Proc. of the 1994 ACM SIGMOD Conference},
  Date-Modified = {2006-10-16 18:26:43 +0900},
  Institution = {University of Wisconsin Madison Computer Science Department},
  Keywords = {toolkit},
  Language = {English},
  Title = {Shoring Up Persistent Applications},
  Year = {1994}}

@inproceedings{SL00,
  Address = {Toledo, Spain},
  Author = {Saito, Y. and Levy, M.},
  Book-Id = {LNCS 1914},
  Booktitle = {Proceedings of the $14^{th}$ International Conference on Distributed Computing {DISC'2000}},
  Isbn = {3-540-41143-7},
  Language = {English},
  Publisher = {Springer-Verlag Heidelberg, Tiergartenstr, D-69121 Heidelberg},
  Slides-Url = {http://128.95.4.112/homes/yasushi/disctalk.ppt},
  Title = {Optimistic replication for {Internet} data services},
  Url = {http://128.95.4.112/homes/yasushi/disc.ps.gz},
  Year = {2000}}

@techreport{SLKS92,
  Author = {Soparkar, N. and Levy, E. and Korth, H. F. and Silberschatz, A.},
  Date-Modified = {2006-08-01 13:59:05 +0900},
  Institution = {University of Texas, Austin {USA}},
  Keywords = {atomic commitment},
  Language = {English},
  Number = {CS-TR-92-15},
  Title = {Adaptive Commitment for Real-Time Distributed Transactions},
  Type = {Technical Report},
  Url = {ftp://ftp.cs.utexas.edu/pub/techreports/tr92-15.ps.Z},
  Year = {1992}}

@inproceedings{SMDN93,
  Author = {Savnik, I. and Mohoric, T. and Dolenc, T.
and Novak, F.},
  Booktitle = {Proceedings of the $7^{th}$ Annual European Computer Conference, COMPEURO 93},
  Keywords = {Functional database models, Design databases},
  Language = {English},
  Pages = {239--247},
  Publisher = {IEEE Computer Society Press},
  Scope = {Database models},
  Title = {Database model for design data},
  Url = {ftp://martin.ijs.si/pub/CSD/Reports/CSD-TR-93-2.ps.gz},
  Year = {1993}
}

@article{SR96,
  Author = {Schiper, A. and Raynal, M.},
  Date-Modified = {2006-05-17 16:48:06 +0900},
  Issn = {0001-0782},
  Journal = {Communications of the ACM},
  Keywords = {group communication, distributed systems, transaction processing},
  Language = {English},
  Number = {4},
  Pages = {84--87},
  Title = {From Group Communication to Transactions in Distributed Systems},
  Url = {http://doi.acm.org/10.1145/227210.227230},
  Volume = {39},
  Year = {1996},
  Abstract = {Because toolkits for developing process groups do not allow applications to issue reliable multicasts to multiple groups, a new development model distinguishing between groups as logical addressing mechanisms and reliable communication primitives is needed to create reliable distributed applications. }
}

@techreport{SS91,
  Author = {Shah, A. and Sturman, D.},
  Date-Modified = {2006-05-17 16:41:08 +0900},
  Institution = {Cornell University, Computer Science Department},
  Keywords = {simulation, distributed databases},
  Language = {English},
  Number = {TR90-1175},
  Pages = {7},
  Title = {A Simulator for Exploring Replication and Locality of Access in a Distributed Database},
  Year = {1991}
}

@inproceedings{SS93,
  Address = {Pittsburgh, Pennsylvania, {USA}},
  Author = {Schiper, A.
and Sandoz, A.},
  Booktitle = {Proceedings of the $13^{th}$ International Conference on Distributed Computing Systems ({ICDCS}-13)},
  Date-Modified = {2006-05-17 16:29:57 +0900},
  Isbn = {0-8186-3770-6},
  Keywords = {reliable broadcast, Synchrony},
  Language = {English},
  Month = {May},
  Pages = {561--568},
  Publisher = {{IEEE} Computer Society Press},
  Title = {Uniform Reliable Multicast in a Virtually Synchronous Environment},
  Url = {http://historical.ncstrl.org/tr/ps/cabernet/TR93-12.ps},
  Year = {1993},
  Abstract = {The authors present the definition of and solution to the uniform reliable multicast problem in the virtually synchronous environment defined by the Isis system. A uniform reliable multicast of a message m has the property that if m has been received by any destination process (faulty or not), then m is received by all processes that reach a decision. Uniform reliable multicast provides a solution to the distributed commit problem. Two multicast primitives are defined in the virtually synchronous model: reliable multicast (called view-atomic) and uniform reliable multicast (called uniform view-atomic). The view-atomic multicast is used to implement the uniform view-atomic primitive. As view-atomicity is based on the concept of process group membership, a connection is established between the process group membership and the distributed commit problems}
}

@inproceedings{SW99,
  Address = {K{\"{u}}hlungsborn, Germany},
  Author = {Schenkel, R.
and Weikum, G.},
  Booktitle = {Proceedings of the 2nd Workshop {EFIS}'99},
  Date-Modified = {2006-05-17 16:35:33 +0900},
  Isbn = {3-89601-013-1},
  Keywords = {federated database, CORBA, transaction processing},
  Language = {English},
  Pages = {79--94},
  Publisher = {Infix, Sankt Augustin},
  Title = {Experiences with Building a Federated Transaction Manager based on {CORBA} {OTS}},
  Url = {http://www-dbs.cs.uni-sb.de/public_html/papers/efis99_final.ps.gz},
  Year = {1999}
}

@article{SWZ98,
  Alt-Url = {http://citeseer.nj.nec.com/102955.html},
  Author = {Scheuermann, P. and Weikum, G. and Zabback, P.},
  Issn = {1066-8888},
  Journal = {VLDB Journal: Very Large Data Bases},
  Language = {English},
  Number = {1},
  Pages = {48--66},
  Publisher = {Springer Verlag},
  Title = {Data Partitioning and Load Balancing in Parallel Disk Systems},
  Url = {http://link.springer.de/link/service/journals/00778/papers/8007001/80070048.pdf},
  Volume = {7},
  Year = {1998}
}

@mastersthesis{Saito00,
  Address = {Seattle, WA 98195 {USA}},
  Author = {Saito, Y.},
  Language = {English},
  School = {University of Washington},
  Title = {Optimistic Replication Algorithms},
  Type = {General Exam Report},
  Url = {http://128.95.4.112/homes/yasushi/general.ps},
  Year = {2000},
  Annote = {What is a General Exam Report?}
}

@article{Schneider90,
  Author = {Schneider, F.~B.},
  Date-Modified = {2006-08-01 14:06:26 +0900},
  Issn = {0360-0300},
  Journal = {{ACM} Computing Surveys},
  Keywords = {distributed systems},
  Language = {English},
  Number = {4},
  Pages = {299--319},
  Title = {Implementing Fault-Tolerant Services Using the State Machine Approach: {A} Tutorial},
  Url = {http://www.acm.org/pubs/articles/journals/surveys/1990-22-4/p299-schneider/p299-schneider.pdf},
  Volume = {22},
  Year = {1990}
}

@inproceedings{Skeen81,
  Address = {Ann Arbor, Michigan {USA}},
  Author = {Skeen, D.},
  Booktitle = {Proceedings of the 1981 International Conference on Management of Data},
  Editor = {Lien, Y.
Edmund},
  Language = {English},
  Pages = {133--142},
  Publisher = {{ACM} {SIGMOD}, New York},
  Title = {Nonblocking Commit Protocols},
  Year = {1981}
}

@article{Stacey94,
  Annote = {Explains commercial databases are lazy},
  Author = {Stacey, D.},
  Date-Modified = {2006-03-08 19:16:13 +0900},
  Journal = {Database Programming \& Design},
  Keywords = {database, replication},
  Language = {English},
  Number = {12},
  Title = {Replication: {DB2}, {Oracle}, or {Sybase}},
  Volume = {7},
  Year = {1994}
}

@article{Stacey95,
  Author = {Stacey, D.},
  Date-Modified = {2005-11-02 15:35:16 +0900},
  Issn = {0163-5808},
  Journal = {{SIGMOD} Record ({ACM} Special Interest Group on Management of Data)},
  Keywords = {database replication},
  Language = {English},
  Number = {4},
  Pages = {95--101},
  Title = {Replication: {DB2}, {O}racle, or {S}ybase},
  Volume = {24},
  Year = {1995},
  Abstract = {Is replication salvation or the devil in disguise?{\newline} Here's what three implementations tell us.}
}

@article{Stonebraker79,
  Author = {Stonebraker, M.},
  Date-Modified = {2006-03-08 19:12:24 +0900},
  Journal = {{IEEE} Transactions on Software Engineering},
  Keywords = {concurrency control, database},
  Language = {English},
  Pages = {188--194},
  Title = {Concurrency Control and Consistency of Multiple Copies of Data in Distributed {\sc {I}ngres}},
  Volume = {SE-5},
  Year = {1979}
}

@book{Stonebraker88,
  Address = {Los Altos, CA 94022, {USA}},
  Author = {Stonebraker, M.},
  Date-Modified = {2006-05-17 16:43:23 +0900},
  Isbn = {0-934613-65-6},
  Keywords = {database},
  Language = {English},
  Pages = {xii + 644},
  Publisher = {Morgan Kaufmann Publishers},
  Title = {Readings in Database Systems},
  Year = {1988}
}

@manual{Sybase98,
  Address = {Sybase, Inc. 6475 Christie Avenue Emeryville, CA 94608 {USA}},
  Language = {English},
  Note = {CM No.
9809-9030},
  Title = {High Availability through Warm-Standby Support in Sybase Replication Server},
  Type = {White Paper},
  Url = {http://www.sybase.com/content/1003078/warm_standby_wp.pdf},
  Year = {1998}
}

@book{TOV99,
  Address = {Upper Saddle River, New Jersey 07458 {USA}},
  Author = {{\"{O}}zsu, M. Tamer and Valduriez, P.},
  Edition = {Second},
  Isbn = {0-13-659707-6},
  Language = {English},
  Publisher = {Prentice Hall},
  Title = {Principles of Distributed Database Systems},
  Year = {1999}
}

@inproceedings{TP98,
  Author = {Theel, O. and Pagnia, H.},
  Booktitle = {Proc. of the Int. Symp. on Fault-Tolerant Computing {FTCS}},
  Language = {English},
  Title = {Optimal Replica Control Protocols Exhibit Symmetric Operation Availabilities},
  Year = {1998}
}

@article{TPST98,
  Affiliation = {Computer Science Laboratory Xerox Palo Alto Research Laboratory Palo Alto, CA 94304 {USA}},
  Author = {Terry, D. B. and Petersen, K. and Spreitzer, M. J. and Theimer, M. M.},
  Journal = {Bulletin of the Technical Committee on Data Engineering},
  Language = {English},
  Number = {4},
  Organization = {{IEEE} Computer Society},
  Pages = {12--20},
  Title = {The Case for Non-Transparent Replication: Example from {B}ayou},
  Volume = {21},
  Year = {1998}
}

@inproceedings{TTPDSH95,
  Address = {Copper Mountain, CO {USA}},
  Alt-Url = {http://www.parc.xerox.com/csl/projects/bayou/pubs/sosp-95/BayouConflictsSOSPPreprint.ps.gz},
  Author = {Terry, D.~B. and Theimer, M.~M. and Petersen, K. and Demers, A.~J. and Spreitzer, M.~J.
and Hauser, C.~H.},
  Booktitle = {Proceedings of the $15^{th}$ Symposium on Operating Systems Principles ({SOSP-15})},
  Date-Modified = {2006-03-08 15:26:36 +0900},
  Institution = {Computer Science Laboratory, Xerox Palo Alto Research Center},
  Keywords = {Bayou, epidemic algorithm},
  Language = {English},
  Pages = {172--182},
  Project-Url = {http://www.parc.xerox.com/csl/projects/bayou/},
  Title = {Managing Update Conflicts in Bayou, a Weakly Connected Replicated Storage System},
  Url = {http://www.acm.org/pubs/articles/proceedings/ops/224056/p184-talluri/p184-talluri.pdf},
  Year = {1995}
}

@article{Than00,
  Author = {Thanisch, P.},
  Date-Modified = {2006-08-01 14:01:35 +0900},
  Journal = {{IEEE} Concurrency},
  Keywords = {atomic commitment},
  Language = {English},
  Pages = {34--41},
  Title = {Atomic Commit in Concurrent Programming},
  Year = {2000}
}

@article{Thomas79,
  Author = {Thomas, R. H.},
  Date-Modified = {2006-03-08 15:21:37 +0900},
  Issn = {0362-5915},
  Journal = {{ACM} Transactions on Database Systems},
  Keywords = {database, consensus},
  Language = {English},
  Number = {2},
  Pages = {180--209},
  Title = {A Majority Consensus Approach to Concurrency Control for Multiple Copy Databases},
  Volume = {4},
  Year = {1979}
}

@article{Thomasian98,
  Author = {Thomasian, A.},
  Date-Modified = {2006-05-17 17:03:16 +0900},
  Journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  Keywords = {Database, modeling, concurrency control},
  Language = {English},
  Number = {1},
  Pages = {173--189},
  Title = {Distributed Optimistic Concurrency Control Methods for High-Performance Transaction Processing},
  Volume = {10},
  Year = {1998}
}

@article{Thomasian98b,
  Author = {Thomasian, A.},
  Issn = {0360-0300},
  Journal = {{ACM} Computing Surveys},
  Keywords = {algorithms; performance},
  Language = {English},
  Number = {1},
  Pages = {70--119},
  Subject = {{\bf D.4.8} Software, OPERATING SYSTEMS, Performance, Modeling and prediction.
{\bf H.2.4} Information Systems, DATABASE MANAGEMENT, Systems, Transaction processing.},
  Title = {Concurrency control: methods, performance, and analysis},
  Url = {http://www.acm.org:80/pubs/citations/journals/surveys/1998-30-1/p70-thomasian/},
  Volume = {30},
  Year = {1998}
}

@article{Thompson97,
  Author = {Thompson, C.},
  Date-Modified = {2006-08-01 14:02:40 +0900},
  Journal = {DBMS},
  Keywords = {database replication},
  Language = {English},
  Number = {5},
  Title = {Database Replication},
  Url = {http://www.dbmsmag.com/9705d15.html},
  Volume = {10},
  Year = {1997}
}

@inproceedings{Triantafillou92,
  Address = {Monterey, California},
  Author = {Triantafillou, P.},
  Booktitle = {Proceedings of the Second Workshop on the Management of Replicated Data},
  Editor = {Paris, J.-F. and Garcia-Molina, H.},
  Language = {English},
  Pages = {40--43},
  Publisher = {{IEEE} Computer Society Press},
  Title = {High Availability is not Enough},
  Year = {1992}
}

@techreport{UDS00,
  Address = {Switzerland},
  Author = {Urb{\'a}n, P. and D{\'e}fago, X. and Schiper, A.},
  Date-Modified = {2006-08-01 13:47:55 +0900},
  Institution = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne},
  Keywords = {performance, distributed systems},
  Language = {English},
  Number = {DSC/2000/012},
  Title = {Contention-Aware Metrics: Analysis of Distributed Algorithms},
  Url = {http://infoscience.epfl.ch/search.py?recid=52343},
  Year = {2000},
  Abstract = {Resource contention is widely recognized as having a major impact on the performance of distributed algorithms. Nevertheless, the metrics that are commonly used to predict their performance take little or no account of contention. In this paper, we define two performance metrics for distributed algorithms that account for network contention as well as CPU contention. We then illustrate the use of these metrics by comparing four Atomic Broadcast algorithms, and show that our metrics allow for a deeper understanding of performance issues than conventional metrics.
}}

@inproceedings{UDS00b,
  Author = {Urb{\'a}n, P. and D{\'e}fago, X. and Schiper, A.},
  Booktitle = {Proceedings of the $9^{th}$ International Conference on Computer Communications and Networks (IC3N 2000)},
  Date-Modified = {2006-05-17 16:58:09 +0900},
  Keywords = {performance, atomic broadcast},
  Language = {English},
  Organization = {{IEEE}},
  Pages = {582--589},
  Title = {Contention-Aware Metrics for Distributed Algorithms: Comparison of Atomic Broadcast Algorithms},
  Url = {http://infoscience.epfl.ch/search.py?recid=49993},
  Year = {2000},
  Abstract = {Contention-Aware Metrics for Distributed Algorithms: Comparison of Atomic Broadcast Algorithms Peter Urban, Xavier Defago and Andre Schiper Resource contention is widely recognized as having a major impact on the performance of distributed algorithms. Nevertheless, the metrics that are commonly used to predict their performance take little or no account of contention. In this paper, we define two performance metrics for distributed algorithms that account for network contention as well as CPU contention. We then illustrate the use of these metrics by comparing four Atomic Broadcast algorithms, and show that our metrics allow for a deeper understanding of performance issues than conventional metrics.}
}

@inproceedings{UDS01,
  Author = {Urb{\'a}n, P. and D{\'e}fago, X. and Schiper, A.},
  Booktitle = {Proc. of the 15th Int'l Conf. on Information Networking (ICOIN-15)},
  Date-Modified = {2006-08-01 13:48:49 +0900},
  Keywords = {group communication, toolkit},
  Language = {English},
  Location = {Beppu, Japan},
  Title = {Neko: {A} Single Environment to Simulate and Prototype Distributed Algorithms},
  Url = {http://infoscience.epfl.ch/search.py?recid=49974},
  Year = {2001},
  Abstract = {Neko: A Single Environment to Simulate and Prototype Distributed Algorithms Peter Urban, Xavier Defago and Andre Schiper Designing, tuning, and analyzing the performance of distributed algorithms and protocols are complex tasks.
A major factor that contributes to this complexity is the fact that there is no single environment to support all phases of the development of a distributed algorithm. This paper presents Neko, an easy to use Java platform that provides a uniform and extensible environment for the various phases of algorithm design and performance evaluation: prototyping, tuning, simulation, deployment, etc. Keywords: simulation, prototyping, distributed algorithms, message passing, middleware, Java.},
  Annote = {Best Student Paper award.}
}

@techreport{VCKD99,
  Address = {77 Massachusetts Avenue Cambridge, MA 02139-4307 USA},
  Author = {Vitenberg, R. and Chockler, G.~V. and Keidar, I. and Dolev, D.},
  Date-Modified = {2006-08-01 13:54:14 +0900},
  Institution = {Massachusetts Institute of Technology},
  Keywords = {group communication},
  Language = {English},
  Number = {MIT-LCS-TR-790},
  Title = {Group Communication Specifications: {A} Comprehensive Study},
  Url = {http://theory.lcs.mit.edu/~idish/ftp/gcs-survey-tr.ps},
  Year = {1999}
}

@techreport{VD00,
  Address = {Switzerland},
  Author = {Vaduva, A. and Dittrich, K.~R.},
  Institution = {Department of Computer Science, University of Z{\"{u}}rich},
  Language = {English},
  Number = {ifi-2000.08},
  Title = {Metadata Management for Data Warehousing: Between Vision and Reality},
  Url = {ftp://ftp.ifi.unizh.ch/pub/techreports/TR-2000/ifi-2000.08.pdf},
  Year = {2000}
}

@mastersthesis{Van00,
  Address = {Netherlands and Switzerland},
  Author = {Vandewall, R.},
  Date-Modified = {2006-08-01 13:46:05 +0900},
  Keywords = {Dragon, CORBA, group communication},
  Language = {English},
  School = {Rijksuniversiteit Groningen and {\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne},
  Title = {Database Replication Prototype},
  Url = {http://infoscience.epfl.ch/search.py?recid=49994&ln=fr},
  Year = {2000},
  Abstract = {This report describes the design of a Replication Framework that facilitates the implementation and comparison of database replication techniques.
Furthermore, it discusses the implementation of a Database Replication Prototype and compares the performance measurements of two replication techniques based on the Atomic Broadcast communication primitive: pessimistic active replication and optimistic active replication. The main contributions of this report can be split into four parts. Firstly, a framework is proposed that accommodates the comparison of various replication techniques. Secondly, the implementation requirements and the theoretical performance characteristics of the pessimistic and the optimistic active replication techniques are thoroughly analysed. Thirdly, the two techniques have been implemented within the framework as a proof of concept, forming the Database Replication Prototype. Finally, we present the performance results obtained using the Database Replication Prototype. They show that in large-scale networks, optimistic active replication outperforms pessimistic active replication. }
}

@inproceedings{Vaysburd99,
  Address = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne, Switzerland},
  Author = {Vaysburd, A.},
  Booktitle = {Proceedings of $18^{th}$ {IEEE} Symposium on Reliable Distributed Systems {SRDS'99}},
  Day = {19--22},
  Institution = {Lucent Technologies, Bell Laboratories, 600 Mountain Avenue, Murray Hill, {NJ} 07974 {USA}},
  Isbn = {0-7695-0290-3},
  Issn = {1060-9857},
  Keywords = {Database Replication},
  Language = {English},
  Month = {October},
  Organization = {{IEEE} Computer Society, {LSE} Operating System Lab {EPFL}},
  Pages = {322--327},
  Publisher = {{IEEE} Computer Society Press},
  Title = {Fault Tolerance in Three-Tier Applications: Focusing on the Database Tier},
  Year = {1999}
}

@inproceedings{Vitek97,
  Author = {Vitek, J.},
  Booktitle = {Proceedings of the 2nd European Research Seminar on Advances in Distributed Systems ({ERSADS'97})},
  Date-Modified = {2006-03-08 19:38:41 +0900},
  Key = {Mobile Computing},
  Keywords = {distributed computing},
  Language = {English},
  Pages = {117--122},
  Title = {New Paradigms
for Distributed Programming},
  Year = {1997}
}

@inproceedings{WAL97,
  Address = {Atlanta, Georgia, {USA}},
  Affiliation = {AT\&T Research Labs and University of Texas at Austin and New York University},
  Author = {Wang, Y.~M. and Damani, O.~P. and Lee, W.~J.},
  Booktitle = {Proceedings of $4^{th}$ International Workshop on Community Networking (CN4)},
  Language = {English},
  Note = {position paper},
  Organization = {IEEE},
  Pages = {59--63},
  Publisher = {IEEE},
  Title = {Reliability Issues in Distributed Component Object Model ({DCOM})},
  Url = {http://www.bell-labs.com/user/woeijyhlee/pubs/cn4/CN4.ps},
  Year = {1997}
}

@inproceedings{WD98,
  Address = {New Orleans, Louisiana, {USA}},
  Author = {Westerlund, A. and Danielsson, J.},
  Booktitle = {Usenix Annual Technical Conference},
  Date-Modified = {2006-05-17 16:41:32 +0900},
  Keywords = {filesystem},
  Language = {English},
  Title = {Arla - a free {AFS} Client},
  Url = {http://www.usenix.org/publications/library/proceedings/usenix98/freenix/arla1.ps},
  Year = {1998}
}

@inproceedings{WPS+00,
  Author = {Wiesmann, M. and Pedone, F. and Schiper, A. and Kemme, B. and Alonso, G.},
  Booktitle = {Proceedings of $20^{th}$ International Conference on Distributed Computing Systems ({ICDCS}'2000)},
  Date-Modified = {2006-08-01 12:10:41 +0900},
  Keywords = {Dragon, database replication},
  Language = {English},
  Location = {Taipei, Taiwan, R.O.C.},
  Organization = {IEEE},
  Title = {Understanding replication in databases and distributed systems},
  Url = {http://doi.ieeecomputersociety.org/10.1109/ICDCS.2000.840959},
  Year = {2000},
  Abstract = {Replication is an area of interest to both distributed systems and databases. The solutions developed from these two perspectives are conceptually similar but differ in many aspects: model, assumptions, mechanisms, guarantees provided, and implementation. In this paper, we provide an abstract and ``neutral'' framework to compare replication techniques from both communities.
The framework has been designed to emphasize the role played by different mechanisms and to facilitate comparisons. The paper describes the replication techniques used in both communities, compares them, and points out ways in which they can be integrated to arrive to better, more robust replication protocols.}
}

@inproceedings{WPS+00b,
  Abstract-Url = {http://lsewww.epfl.ch/Documents/abstract/WPS+00b.txt},
  Author = {Wiesmann, M. and Pedone, F. and Schiper, A. and Kemme, B. and Alonso, G.},
  Author-Url = {http://lsewww.epfl.ch/wiesmann/},
  Booktitle = {Proceedings of $19^{th}$ Symposium on Reliable Distributed Systems (SRDS'2000)},
  Date-Modified = {2006-08-01 11:57:42 +0900},
  Isbn = {0-7695-0601-1},
  Issn = {1063-6927},
  Keywords = {Dragon, database replication},
  Language = {English},
  Location = {N{\"u}rnberg, Germany},
  Order-Number = {PR--601},
  Organization = {IEEE},
  Pages = {206--215},
  Title = {Database Replication Techniques: a three parameter classification},
  Url = {http://doi.ieeecomputersociety.org/10.1109/RELDI.2000.885408},
  Year = {2000},
  Abstract = {Data replication is an increasingly important topic as databases are increasingly deployed over clusters of workstations. One of the challenges in database replication is to introduce replication without severely affecting performance. Because of this difficulty, current database products use lazy replication, which is very efficient but can compromise consistency. As an alternative, eager replication guarantees consistency but most existing protocols have a prohibitive cost. In order to clarify the current state of the art and open up new avenues for research, this paper analyzes existing eager techniques using three key parameters. In our analysis, we distinguish eight classes of eager replication protocols and, for each category, discuss its requirements, capabilities, and cost.
The contribution lies in showing when eager replication is feasible and in spelling out the different aspects a database replication protocol must account for.}
}

@inproceedings{WPS99,
  Author = {Wiesmann, M. and Pedone, F. and Schiper, A.},
  Booktitle = {Proceedings of the $3^{rd}$ European Research Seminar on Advances in Distributed Systems ({ERSADS}'99)},
  Date-Modified = {2006-08-01 11:51:23 +0900},
  Keywords = {Dragon, classification, database replication, group communication},
  Language = {English},
  Location = {Madeira Island (Portugal)},
  Organization = {{BROADCAST} {E}sprit WG 22455},
  Pages = {264--274},
  Title = {A Systematic Classification of Replicated Database Protocols based on Atomic Broadcast},
  Url = {http://infoscience.epfl.ch/search.py?recid=50016},
  Year = {1999},
  Abstract = { Database replication protocols based on group communication primitives have recently emerged as a promising technology to improve database fault-tolerance and performance. Roughly speaking, this approach consists in exploiting the order and atomicity properties provided by group communication primitives or, more specifically Atomic Broadcast, to guarantee transaction properties. This paper proposes a systematic classification of non voting database replication algorithms based on Atomic Broadcast. }
}

@techreport{WS00,
  Author = {Wang, M. and Suda, T.},
  Date-Modified = {2005-11-02 15:25:38 +0900},
  Institution = {Department of Information and Computer Science, University of California, Irvine {USA}},
  Keywords = {network, artificial life, fault-tolerance},
  Language = {English},
  Number = {00-03},
  Title = {The Bio-Networking Architecture: {A} Biologically Inspired Approach to the Design of Scalable, Adaptive and Survivable / Available Network Applications},
  Year = {2000},
  Annote = {Not very conclusions}
}

@inproceedings{WS84,
  Author = {Weikum, G.
and Schek, H.-J.},
  Booktitle = {Proceedings of the 10th {VLDB} Conference},
  Date-Modified = {2006-08-23 18:19:32 +0900},
  Keywords = {transaction processing},
  Language = {English},
  Location = {Singapore},
  Title = {Architectural Issues of Transaction Management in Multi-Layered Systems},
  Year = {1984}
}

@inproceedings{WS95,
  Author = {Wilhelm, U. G. and Schiper, A.},
  Booktitle = {Proceedings of the $14^{th}$ Symposium on Reliable Distributed Systems ({SRDS-14})},
  Date-Modified = {2006-08-01 13:44:29 +0900},
  Isbn = {0-8186-7153-X},
  Keywords = {atomic broadcast, toolkit},
  Language = {English},
  Location = {Bad Neuenahr, Germany},
  Organization = {IEEE},
  Title = {A Hierarchy of Totally Ordered Multicasts},
  Url = {http://infoscience.epfl.ch/search.py?recid=50163&ln=fr},
  Year = {1995},
  Abstract = {The increased interest in protocols that provide a total order on message delivery has led to several different definitions of total order. In this paper we investigate these different definitions and propose a hierarchy that helps to better understand the implications of the different possibilities in terms of guarantees and communication cost. We identify two definitions: weak total order and strong total order, which are at the extremes of the proposed hierarchy, and incorporate them into a consistent design. Finally, we propose high-level algorithms based on a virtually synchronous communication environment that implement the given definitions.}
}

@article{WV00,
  Author = {Whittaker, J. A. and Voas, J.},
  Journal = {Computer},
  Language = {English},
  Number = {12},
  Pages = {36--42},
  Title = {Toward a More Reliable Theory of Software Reliability},
  Volume = {33},
  Year = {2000}
}

@unpublished{Wagner99,
  Author = {Wagner, G.},
  Language = {English},
  Note = {Seminar Report},
  Title = {{RAID} Disk Storage},
  Url = {http://citeseer.nj.nec.com/270179.html},
  Year = {1999}
}

@article{Watterson96,
  Author = {Watterson, K.},
  Date-Modified = {2006-08-01 14:05:10 +0900},
  Issn = {0011-6963},
  Journal = {Datamation},
  Keywords = {database replication},
  Language = {English},
  Number = {15},
  Pages = {62--68},
  Title = {Database Replication Explained},
  Volume = {42},
  Year = {1996}
}

@phdthesis{Wiesmann02,
  Address = {Switzerland},
  Author = {Wiesmann, M.},
  Date-Modified = {2006-08-01 13:22:32 +0900},
  Keywords = {group communication, database replication},
  Language = {English},
  Month = {May},
  Number = {2577},
  School = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne},
  Title = {Group Communications and Database Replication: Techniques, Issues and Performance},
  Url = {http://library.epfl.ch/theses/?nr=2577},
  Year = {2002},
  Abstract = {Databases are an important part of today's IT infrastructure: both companies and state institutions rely on database systems to store most of their important data. As we are more and more dependent on database systems, securing this key facility is now a priority. Because of this, research on fault-tolerant database systems is of increasing importance. One way to ensure the fault-tolerance of a system is by replicating it. Replication is a natural way to deal with failures: if one copy is not available, we use another one. However implementing consistent replication is not easy. Database replication is hardly a new area of research: the first papers on the subject are more than twenty years old. Yet how to build an efficient, consistent replicated database is still an open research question.
Recently, a new approach to solve this problem has been proposed. The idea is to rely on some communication infrastructure called group communications. This infrastructure offers some high-level primitives that can help in the design and the implementation of a replicated database. While promising, this approach to database replication is still in its infancy. This thesis focuses on group communication-based database replication and strives to give an overall understanding of this topic. This thesis has three major contributions. In the structural domain, it introduces a classification of replication techniques. In the qualitative domain, an analysis of fault-tolerance semantics is proposed. Finally, in the quantitative domain, a performance evaluation of group communication-based database replication is presented. The classification gives an overview of the different means to implement database replication. Techniques described in the literature are sorted using this classification. The classification highlights structural similarities of techniques originating from different communities (database community and distributed system community). For each category of the classification, we also analyse the requirements imposed on the database component and group communication primitives that are needed to enforce consistency. Group communication-based database replication implies building a system from two different components: a database system and a group communication system. Fault-tolerance is an end-to-end property: a system built from two components tends to be as fault-tolerant as the weakest component. The analysis of fault-tolerance semantics shows what fault-tolerance guarantee is ensured by group communication based replication techniques. Additionally a new fault-tolerance guarantee, group-safety, is proposed. Group-safety is better suited to group communication-based database replication. 
We also show that group-safe replication techniques can offer improved performance. Finally, the performance evaluation offers a quantitative view of group communication based replication techniques. The performance of group communication techniques and classical database replication techniques is compared. The way those different techniques react to different loads is explored. Some optimisation of group communication techniques are also described and their performance benefits evaluated.}
}

@mastersthesis{Wiesmann97,
  Address = {24 rue G{\'e}n{\'e}ral Dufour, CH-1211 Geneva 4, Switzerland},
  Author = {Wiesmann, M.},
  Date-Modified = {2006-08-01 13:10:39 +0900},
  Keywords = {fault-tolerance},
  Language = {English},
  School = {Department of Computer Science, University of Geneva},
  Title = {Management Enhanced Components},
  Url = {http://ddsg.jaist.ac.jp/pub/Wie97.pdf},
  Year = {1997}
}

@phdthesis{Wolf98,
  Abstract-Url = {http://web.onetelnet.ch/~twolf/tw/thesis.html},
  Address = {Switzerland},
  Author = {Wolf, T.},
  Language = {English},
  School = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne},
  Title = {Replication of non-deterministic objects},
  Url = {http://web.onetelnet.ch/~twolf/tw/docs/thesis.A4.ps.gz},
  Year = {1998}
}

@article{Wool98,
  Affiliation = {Bell Labs, Lucent Technologies, Murray Hills, New Jersey, USA},
  Author = {Wool, A.},
  Author-Url = {http://www.bell-labs.com/~yash},
  Journal = {Bulletin of the Technical Committee on Data Engineering},
  Language = {English},
  Number = {4},
  Organization = {{IEEE} Computer Society},
  Pages = {3--11},
  Title = {Quorum Systems in Replicated Databases: Science or Fiction?},
  Url = {ftp://ftp.research.microsoft.com/pub/debull/dec98-a4draft.ps},
  Volume = {21},
  Year = {1998}
}

@techreport{Zhang95c,
  Author = {Zhang, A. and Nodine, M.
and Bhargava, B.},
  Institution = {Department of Computer Science, SUNY Buffalo},
  Language = {English},
  Number = {94-43},
  Title = {Ensuring Semi-Atomicity in Heterogeneous Distributed Database Systems},
  Url = {ftp://ftp.cs.buffalo.edu/pub/tech-reports/94-43.ps.Z},
  Year = {1995}
}

@phdthesis{Levine99,
  Address = {Santa Cruz, USA},
  Author = {Levine, B.~L.},
  School = {University of California},
  Title = {Network Support for Group Communications},
  Url = {http://citeseer.nj.nec.com/levine99network.html},
  Year = {1999}
}

@inproceedings{SB01,
  Author = {Singh, G. and Badarpura, S.},
  Booktitle = {Proceedings of the $21^{st}$ International Conference on Distributed System Workshops (ICDCSW'01)},
  Date-Modified = {2006-08-01 14:01:00 +0900},
  Keywords = {atomic broadcast},
  Title = {Application Ordering in Group Communication},
  Year = {2001}
}

@inproceedings{VR02,
  Address = {Osaka University, Suita, Japan},
  Author = {Vicente, P. and Rodrigues, L.},
  Booktitle = {Proceedings of the $21^{st}$ Symposium on Reliable Distributed Systems},
  Date-Modified = {2006-08-01 13:59:49 +0900},
  Keywords = {atomic broadcast, optimistic technique},
  Organization = {IEEE},
  Pages = {92--101},
  Title = {An Indulgent Uniform Total Order Algorithm with Optimistic Delivery},
  Year = {2002}
}

@techreport{LDAP95,
  Author = {Yeong, W. and Howes, T. and Kille, S.},
  Date-Modified = {2006-03-08 15:30:52 +0900},
  Institution = {Internet Engineering Task Force (IETF)},
  Keywords = {LDAP, RFC},
  Number = {1777},
  Title = {Lightweight Directory Access Protocol},
  Type = {RFC},
  Url = {http://www.ietf.org/rfc/rfc1777.txt},
  Year = {1995},
  Abstract = {The protocol described in this document is designed to{\newline} provide access to the X.500 Directory while not{\newline} incurring the resource requirements of the Directory{\newline} Access Protocol (DAP).
This protocol is specifically{\newline} targeted at simple management applications and browser{\newline} applications that provide simple read/write interactive{\newline} access to the X.500 Directory, and is intended to be a{\newline} complement to the DAP itself. Key aspects of LDAP are:{\newline} - Protocol elements are carried directly over TCP or{\newline} other transport, bypassing much of the{\newline} session/presentation overhead. - Many protocol data{\newline} elements are encoding as ordinary strings (e.g.,{\newline} Distinguished Names). - A lightweight BER encoding is{\newline} used to encode all protocol elements.}} @techreport{RFC1157, Author = {Case, J. and Fedor, M. and Schoffstall, M. and Davin, J.}, Date-Modified = {2006-07-14 17:54:20 +0900}, Institution = {IETF}, Keywords = {SNMP, RFC}, Number = {1157}, Title = {A Simple Network Management Protocol ({SNMP})}, Type = {RFC}, Url = {http://www.ietf.org/rfc/rfc1157.txt}, Year = {1990}} @book{HSG99, Author = {Howes, T.~A. and Smith, M.~C. and Good, G.~S.}, Isbn = {1-57870-070-1}, Keywords = {LDAP}, Publisher = {Macmillan Technical Publishing}, Title = {Understanding and Deploying {LDAP} Directory Services}, Url = {http://developer.netscape.com/docs/books/macmillan/ldap/ldapbk.html}, Year = {1999}} @mastersthesis{Rei02, Address = {Switzerland}, Author = {Reichenbach, F.}, Date-Modified = {2006-08-01 13:51:51 +0900}, Keywords = {SNMP, failure detection}, Language = {French}, School = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne}, Title = {Service {SNMP} de d{\'e}tection de faute pour des syst{\`e}mes r{\'e}partis}, Type = {Diploma Thesis}, Url = {http://infoscience.epfl.ch/search.py?recid=49953}, Year = {2002}} @article{KSMD02, Author = {Keidar, I. and Sussman, J. and Marzullo, K. 
and Dolev, D.}, Date-Modified = {2006-05-17 16:38:18 +0900}, Issn = {0734-2071}, Journal = {ACM Transactions on Computer Systems}, Keywords = {failure detection, toolkit, service}, Language = {English}, Month = {August}, Number = {3}, Pages = {191--238}, Title = {{Moshe}: {A} Group Membership Service for {WAN}s}, Url = {http://doi.acm.org/10.1145/566340.566341}, Volume = {20}, Year = {2002}, Abstract = {We present Moshe, a novel scalable group membership algorithm built specifically for use in wide area networks (WANs), which can suffer partitions. Moshe is designed with three new significant features that are important in this setting: it avoids delivering views that reflect out-of-date memberships; it requires a single round of messages in the common case; and it employs a client-server design for scalability. Furthermore, Moshe's interface supplies the hooks needed to provide clients with full virtual synchrony semantics. We have implemented Moshe on top of a network event mechanism also designed specifically for use in a WAN. In addition to specifying the properties of the algorithm and proving that this specification is met, we provide empirical results of an implementation of Moshe running over the Internet. The empirical results justify the assumptions made by our design and exhibit good performance. In particular, Moshe terminates within a single communication round over 98\% of the time. The experimental results also lead to interesting observations regarding the performance of membership algorithms over the Internet.}} @inproceedings{FDG+99, Author = {Felber, P. and D{\'e}fago, X. and Guerraoui, R. and Oser, P.}, Booktitle = {Proc. of the Int. Symp.
on Distributed Objects and Applications (DOA)}, Date-Modified = {2006-07-14 18:01:49 +0900}, Keywords = {failure detection, toolkit}, Location = {Edinburgh, Scotland}, Pages = {132--141}, Title = {Failure Detectors as First Class Objects}, Url = {http://lsewww.epfl.ch/Publications/ById/207.html}, Year = {1999}} @unpublished{Ban02, Author = {Ban, B.}, Date-Modified = {2006-03-08 19:22:44 +0900}, Institution = {Department of Computer Science, Cornell University}, Keywords = {group communication}, Language = {English}, Note = {Unpublished manuscript}, Title = {Implementing Group Protocol Using Dynamic Remote Method Calls}, Url = {http://www.javagroups.com/javagroupsnew/docs/papers/rmc.ps.gz}, Year = {2002}} @phdthesis{Ban97, Address = {Switzerland}, Author = {Ban, B.}, Keywords = {CORBA, SNMP, CMIP}, Language = {English}, School = {Wirtschaftswissenschaftliche Fakult{\"a}t der Universit{\"a}t Z{\"u}rich}, Title = {A Generic Management Model for {CORBA}, {CMIP} and {SNMP}}, Url = {http://www.cs.cornell.edu/home/bba/papers.html}, Year = {1997}, Abstract = {The predominant models to perform network and systems{\newline} management have traditionally been SNMP and CMIP.{\newline} However, SNMP is beginning to reach its limits when{\newline} more complex management tasks are to be performed, and{\newline} CMIP -- owing to its complexity and slow{\newline} standardization process -- has not yet gained the same{\newline} acceptance as SNMP. The advent of object-oriented{\newline} distributed processing models has brought forward a{\newline} third candidate, CORBA, originally not targeted{\newline} specifically at management tasks, but in many respects{\newline} nevertheless suitable for managing both local and wide{\newline} area networks. CORBA is more powerful than SNMP and{\newline} less complex than CMIP.
Its affiliation with C++, a{\newline} widely used programming language, lends it to immediate{\newline} use by a huge number of programmers and allows them to{\newline} introduce distribution into their programs without a{\newline} too drastic change of philosophy. It is therefore{\newline} assumed that CORBA will become important in the network{\newline} and systems management domain as well as in the{\newline} distributed systems domain. To be more precise, CORBA{\newline} will be used to implement management applications{\newline} (managers) and managed entities (agents). The author{\newline} assumes that, in the near future, all models will have{\newline} to coexist, because investments in older models have{\newline} been made, because CORBA may not yet be ready for{\newline} certain specialized management tasks (e.g. embedded{\newline} management agents), or simply for political reasons.{\newline} Therefore, the need arises to manage a system written{\newline} in one model from a system written in a different{\newline} model, for example to manage an OSI agent using a{\newline} CORBA-based manager. Assuming that CORBA will achieve{\newline} broad acceptance in the management world, it would be{\newline} desirable to be able to manage other models{\newline} transparently from CORBA. The benefits would be{\newline} investment protection of existing managed entities, the{\newline} opening of the SNMP- and CMIP-dominated management{\newline} world to management-inexperienced (CORBA) programmers,{\newline} and the unification of management in one common,{\newline} simple, and yet powerful model. The goal of this work{\newline} is to examine how CORBA can be used for network and{\newline} systems management. 
The focus is on the client side;{\newline} that is, how CORBA can be used to implement management{\newline} applications that access managed entities, rather than{\newline} how CORBA can be used to implement managed entities.{\newline} Work in this area already exists; an overview will be{\newline} given and it will be shown that most work focuses on{\newline} compile-time static translation of one model to{\newline} another. This generates a number of problems. Therefore{\newline} a dynamic runtime-based approach is proposed that{\newline} eliminates these problems and has several advantages{\newline} over static approaches. The proposed model is a{\newline} combination of a generic object model, metadata, and{\newline} adapters that convert between the generic and specific{\newline} target models. Although the elements of the proposed{\newline} models are established and well known, their{\newline} combination and their application to the domain of{\newline} management is novel and makes for the issues of{\newline} interest in this thesis.}} @techreport{APEX02, Author = {Rose, M. and Klyne, G. and Crocker, D.}, Date-Modified = {2006-08-01 13:57:54 +0900}, Institution = {The Internet Society (IETF)}, Keywords = {RFC}, Language = {English}, Number = {3340}, Title = {The Application Exchange Core}, Type = {RFC}, Url = {http://www.beepcore.org/beepcore/docs/rfc3340.html}, Year = {2002}, Abstract = {This memo describes Application Exchange (APEX) Core, an extensible, asynchronous message relaying service for application layer programs.}} @techreport{BEEP01, Author = {Rose, M.}, Date-Modified = {2006-08-01 13:58:01 +0900}, Institution = {Internet Society (IETF)}, Keywords = {RFC}, Language = {English}, Number = {3080}, Title = {The Blocks Extensible Exchange Protocol Core}, Type = {RFC}, Url = {http://www.ietf.org/rfc/rfc3080.txt}, Year = {2001}} @techreport{APEX:PUBSUB01, Author = {Schwartz, M. and Rose, M. 
and Carlberg, K.}, Date-Modified = {2006-05-17 16:35:54 +0900}, Institution = {Internet Society (IETF)}, Keywords = {MOM, standard}, Language = {English}, Title = {The {APEX} Publish-Subscribe Service}, Type = {Draft}, Url = {http://www.codeontheroad.com/papers/draft-schwartz-apex-pubsub.html}, Year = {2001}} @inproceedings{LBPK01, Address = {Heidelberg, Germany}, Author = {Laumay, P. and Bruneton, E. and de Palma, N. and Krakowiak, S.}, Booktitle = {Proceedings of the Middleware 2001: IFIP/ACM International Conference on Distributed Systems Platforms}, Date-Modified = {2006-05-17 16:40:07 +0900}, Keywords = {MOM, toolkit, causal order}, Language = {English}, Pages = {311--329}, Publisher = {Springer-Verlag}, Series = {Lecture Notes in Computer Science}, Title = {Preserving Causality in a Scalable Message-Oriented Middleware}, Url = {http://citeseer.nj.nec.com/laumay01preserving.html}, Volume = {2218}, Year = {2001}} @inproceedings{PAP99, Address = {Calcutta, India}, Author = {Praveen, H. and Arvindam, S. and Pokarna, S.}, Booktitle = {Proceedings of the $6^{th}$ International Conference on High Performance Computing (HiPC)}, Date-Modified = {2005-11-02 15:28:31 +0900}, Keywords = {consensus, group communication, atomic broadcast}, Language = {English}, Title = {Thunderbolt: {A} Consensus-Based Infrastructure for Loosely Coupled Cluster Computing}, Url = {http://www.hipc.org/hipc99/papers/paper9.ps.gz}, Year = {1999}} @inproceedings{HCK02, Author = {Hayashibara, N. and Cherif, A. and Katayama, T.}, Booktitle = {Proc. of the 1$^{st}$ Workshop on Self-Repairing and Self-Configurable Distributed Systems (RCDS)}, Date-Modified = {2006-07-15 11:10:18 +0900}, Keywords = {failure detection}, Language = {English}, Location = {Osaka, Japan}, Main-Conference = {21st IEEE Int'l Symp.
on Reliable Distributed Systems (SRDS-21)}, Pages = {404--409}, Title = {Failure detectors for large-scale distributed systems}, Url = {http://ddsg.jaist.ac.jp/en/pub/HCK02.html}, Year = {2002}, Abstract = {This paper discusses the problem of implementing a scalable failure detection service for Grid systems. More specifically, traditional implementations of failure detectors are often tuned for running over local networks and fail to address some important problems found in wide-area distributed systems, such as Grid systems. We identify some of the most important problems raised in the context of Grids. We then survey recent propositions that can help in solving some of these problems. }} @article{CTA02, Author = {Chen, W. and Toueg, S. and Aguilera, M.~K.}, Date-Modified = {2005-11-02 15:25:03 +0900}, Journal = {IEEE Transactions on Computers}, Keywords = {failure detection}, Language = {English}, Number = {5}, Pages = {561--580}, Title = {On the Quality of Service of Failure Detectors}, Url = {http://www.hpl.hp.com/personal/Marcos_Aguilera/papers/qos_ieee_tc2002.pdf}, Volume = {51}, Year = {2002}, Abstract = {We study the quality of service (QoS) of failure detectors. By QoS, we mean a specification that quantifies 1) how fast the failure detector detects actual failures and 2) how well it avoids false detections. We first propose a set of QoS metrics to specify failure detectors for systems with probabilistic behaviors, i.e., for systems where message delays and message losses follow some probability distributions. We then give a new failure detector algorithm and analyze its QoS in terms of the proposed metrics. We show that, among a large class of failure detectors, the new algorithm is optimal with respect to some of these QoS metrics.
Given a set of failure detector QoS requirements, we show how to compute the parameters of our algorithm so that it satisfies these requirements and we show how this can be done even if the probabilistic behavior of the system is not known. We then present some simulation results that show that the new failure detector algorithm provides a better QoS than an algorithm that is commonly used in practice. Finally, we suggest some ways to make our failure detector adaptive to changes in the probabilistic behavior of the network. }} @techreport{IP81, Author = {Postel, J.}, Date-Modified = {2006-03-08 15:32:04 +0900}, Institution = {Internet Society (IETF)}, Keywords = {RFC, IP}, Language = {English}, Number = {791}, Title = {Internet Protocol {DARPA} Internet Program Protocol Specification}, Type = {RFC}, Url = {http://www.ietf.org/rfc/rfc0791.txt}, Year = {1981}} @techreport{UDP80, Author = {Postel, J.}, Date-Modified = {2006-03-08 15:31:40 +0900}, Institution = {Internet Society (IETF)}, Keywords = {IP, RFC}, Language = {English}, Number = {768}, Title = {User Datagram Protocol}, Type = {RFC}, Url = {http://www.ietf.org/rfc/rfc768.txt}, Year = {1980}} @techreport{TCP81, Author = {Postel, J.}, Date-Modified = {2006-03-08 15:32:13 +0900}, Institution = {Internet Society (IETF)}, Keywords = {RFC, IP}, Language = {English}, Number = {793}, Title = {Transmission Control Protocol}, Type = {RFC}, Url = {http://www.ietf.org/rfc/rfc793.txt}, Year = {1981}} @manual{JMS02, Address = {901 San Antonio Road Palo Alto, CA 94303 {USA}}, Author = {Hapner, M. and Burridge, R. and Sharma, R. and Fialli, J. and Stout, K.}, Date-Modified = {2006-05-17 16:38:06 +0900}, Keywords = {JMS, Java, MOM}, Language = {English}, Organization = {Sun Microsystems, Inc.}, Title = {Java Message Service}, Year = {2002}} @article{CKV01, Author = {Chockler, G.~V. and Keidar, I. 
and Vitenberg, R.}, Date-Modified = {2005-11-02 15:28:05 +0900}, Journal = {ACM Computing Surveys}, Keywords = {group communication}, Language = {English}, Number = {4}, Pages = {427--469}, Title = {Group communication specifications: {A} comprehensive study}, Volume = {33}, Year = {2001}} @inproceedings{CFFK01, Author = {Czajkowski, K. and Fitzgerald, S. and Foster, I. and Kesselman, C.}, Booktitle = {Proceedings of the $10^{th}$ Symposium on High Performance Distributed Computing}, Date-Modified = {2006-04-04 18:47:39 +0900}, Keywords = {grid}, Language = {English}, Title = {Grid Information Services for Distributed Resource Sharing}, Url = {http://citeseer.nj.nec.com/czajkowski01grid.html}, Year = {2001}} @techreport{AT01, Address = {Baltimore, MD 21218 USA}, Author = {Amir, Y. and Tutu, C.}, Date-Modified = {2005-11-02 15:33:57 +0900}, Institution = {Department of Computer Science, Johns Hopkins University}, Keywords = {database replication, partitionable membership, atomic broadcast}, Language = {English}, Number = {CNDS-2001-6}, Title = {From Total Order to Database Replication}, Url = {http://citeseer.nj.nec.com/460405.html}, Year = {2001}} @inproceedings{AT02, Address = {Vienna, Austria}, Author = {Amir, Y.
and Tutu, C.}, Booktitle = {Proceedings of the 22nd International Conference on Distributed Computing Systems (ICDCS)}, Date-Modified = {2006-03-08 15:28:19 +0900}, Keywords = {database, replication, partitionable membership, atomic broadcast}, Language = {English}, Organization = {IEEE}, Title = {From Total Order to Database Replication}, Url = {http://www.cnds.jhu.edu/pub/papers/AT02_icdcs.pdf}, Year = {2002}} @inproceedings{Kemme02, Address = {Bertinoro, Italy}, Author = {Kemme, B.}, Booktitle = {Proceedings of the International Workshop on Future Directions in Distributed Computing (FuDiCo 2002)}, Date-Modified = {2006-08-01 14:03:34 +0900}, Keywords = {database replication, group communication}, Language = {English}, Title = {Implementing Database Replication based on Group Communication.}, Url = {http://www.cs.mcgill.ca/~kemme/papers/fudico02.pdf}, Year = {2002}} @inproceedings{TML97, Address = {New York, USA}, Author = {Thekkath, C.~A. and Mann, T. and Lee, E.~K.}, Booktitle = {Proceedings of the 16th Symposium on Operating Systems Principles (SOSP-97)}, Date-Modified = {2006-05-17 16:48:06 +0900}, Keywords = {distributed systems, filesystem}, Language = {English}, Pages = {224--237}, Publisher = {ACM Press}, Title = {Frangipani: {A} Scalable Distributed File System}, Url = {http://research.compaq.com/SRC/personal/thekkath/frangipani/frangipani.acmcopyright.ps}, Year = {1997}} @techreport{GL00, Address = {130, Lytton Avenue Palo Alto, California 94301 USA}, Author = {Gafni, E. and Lamport, L.}, Date-Modified = {2006-05-30 11:35:17 +0200}, Institution = {Compaq Systems Research Center}, Keywords = {Paxos, consensus, crash recovery}, Language = {English}, Number = {SRC 163}, Title = {Disk Paxos}, Url = {http://citeseer.nj.nec.com/319923.html}, Year = {2000}, Abstract = {We present an algorithm, called Disk Paxos, for implementing a reliable distributed system with a network of processors and disks. 
Like the original Paxos algorithm, Disk Paxos maintains consistency in the presence of arbitrary non-Byzantine faults. Progress can be guaranteed as long as a majority of the disks are available, even if all processors but one have failed.}} @inproceedings{LT96, Address = {Cambridge, MA USA}, Author = {Lee, E.~K. and Thekkath, C.~A.}, Booktitle = {Proceedings of the Seventh International Conference on Architectural Support for Programming Languages and Operating Systems}, Date-Modified = {2006-05-17 16:35:18 +0900}, Keywords = {filesystem, fault-tolerance, Paxos}, Language = {English}, Pages = {84--92}, Title = {{Petal}: Distributed Virtual Disks}, Url = {http://citeseer.nj.nec.com/lee96petal.html}, Year = {1996}} @article{FG01, Author = {Fr{\o}lund, S. and Guerraoui, R.}, Date-Modified = {2005-11-02 15:34:24 +0900}, Journal = {IEEE Transactions on Parallel and Distributed Systems}, Keywords = {3-tier, database replication}, Language = {English}, Number = {2}, Title = {Implementing e-Transactions with Asynchronous Replication}, Url = {http://lpdwww.epfl.ch/rachid/papers/00_010.ps.gz}, Volume = {12}, Year = {2001}} @inproceedings{CHD98, Address = {Puerto Vallarta, Mexico}, Author = {Chockler, G.~V. and Huleihel, N. and Dolev, D.}, Booktitle = {Proceedings of the seventeenth annual ACM symposium on Principles of distributed computing}, Date-Modified = {2006-03-08 15:28:40 +0900}, Isbn = {0-89791-977-7}, Keywords = {partitionable membership}, Language = {English}, Organization = {ACM}, Pages = {237--246}, Title = {An adaptive totally ordered multicast protocol that tolerates partitions}, Url = {http://doi.acm.org/10.1145/277697.277741}, Year = {1998}} @inproceedings{GS96, Address = {New York {USA}}, Author = {Gokhale, A.
and Schmidt, D.~C.}, Booktitle = {Proceedings of the Conference on Applications, Technologies, Architectures, and Protocols for Computer Communications}, Date-Modified = {2006-04-11 10:40:27 +0900}, Isbn = {0-89791-790-1}, Keywords = {middleware}, Language = {English}, Organization = {{ACM} {SIGCOMM}}, Pages = {306--317}, Title = {Measuring the Performance of Communication Middleware on High-Speed Networks}, Url = {http://citeseer.nj.nec.com/gokhale96measuring.html}, Year = {1996}} @article{JM90, Author = {Jajodia, S. and Mutchler, D.}, Date-Modified = {2005-11-02 15:31:21 +0900}, Journal = {ACM Transactions on Database Systems}, Keywords = {database replication, quorum}, Language = {English}, Number = {2}, Pages = {230--280}, Title = {Dynamic Voting Algorithms for Maintaining the Consistency of a Replicated Database}, Url = {http://www.acm.org/pubs/citations/journals/tods/1990-15-2/p230-jajodia/}, Volume = {15}, Year = {1990}} @inproceedings{WDS03, Author = {Wiesmann, M. and D{\'e}fago, X. and Schiper, A.}, Booktitle = {Proc. of the Int. Symp. on Network Computing and Applications (NCA)}, Date-Modified = {2006-08-01 13:41:02 +0900}, Keywords = {group communication, standard}, Location = {Cambridge, MA, USA}, Organization = {IEEE}, Pages = {140--147}, Title = {Group communication based on standard interfaces}, Url = {http://infoscience.epfl.ch/search.py?recid=49935&ln=fr}, Year = {2003}, Abstract = {While group communication systems have been proposed for some time, they are still not used much in actual systems. We believe that one reason for this is the lack of standardisation of group communication system interfaces. The paper proposes an architecture, using the standard decomposition into services, where services are based on standard interfaces: both interactions between services and interactions with the application use existing, open standards.
A decomposition of the group communication into services is presented, along with a description of applicable standards. As an example, a group membership service based on the LDAP standard is discussed.}} @article{KD98, Author = {Keidar, I. and Dolev, D.}, Date-Modified = {2006-03-08 19:16:27 +0900}, Journal = {Journal of Computer and System Sciences (JCSS)}, Keywords = {quorum, atomic commitment}, Language = {English}, Number = {3}, Pages = {309--324}, Title = {Increasing the Resilience of Distributed and Replicated Database Systems}, Url = {http://theory.lcs.mit.edu/~idish/Abstracts/jcss.html}, Volume = {57}, Year = {1998}} @techreport{Schiper03, Address = {Lausanne, Switzerland}, Author = {Schiper, A.}, Date-Modified = {2005-11-02 15:37:06 +0900}, Institution = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne}, Keywords = {group communication, view synchrony}, Language = {English}, Number = {IC/2003/27}, Title = {Dynamic Group Communication}, Url = {http://lsewww.epfl.ch/Publications/ById/342.html}, Year = {2003}} @article{RR03, Author = {Raynal, M. and Rodrigues, L.}, Date-Modified = {2006-08-01 14:01:26 +0900}, Journal = {IEEE Transactions on Knowledge and Data Engineering}, Keywords = {atomic broadcast}, Language = {English}, Title = {Atomic Broadcast in Asynchronous Crash-Recovery Distributed Systems and its use in Quorum-Based Replication}, Year = {2003}} @inproceedings{Maffeis95, Author = {Maffeis, S.}, Booktitle = {USENIX Conference on Object-Oriented Technologies}, Date-Modified = {2006-05-17 16:10:20 +0900}, Keywords = {CORBA, group communication, toolkit}, Language = {English}, Title = {Adding Group Communication and Fault-Tolerance to {CORBA}}, Url = {http://www.softwired.ch/people/maffeis/articles/research/electra_corba.pdf}, Year = {1995}} @techreport{WS03, Address = {Switzerland}, Author = {Wiesmann, M.
and Schiper, A.}, Date-Modified = {2006-09-15 15:25:30 +0900}, Institution = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne}, Keywords = {database, replication, consistency}, Language = {English}, Number = {IC/2003/18}, Title = {Beyond 1-Safety and 2-Safety for Replicated Databases: Group-Safety}, Url = {http://infoscience.epfl.ch/getfile.py?recid=49907&mode=best}, Year = {2003}, Abstract = {In this paper, we study the safety guarantees of group communication-based database replication techniques. We show that there is a model mismatch between group communication and database, and because of this, classical group communication systems cannot be used to build 2-safe database replication. We propose a new group communication primitive called end-to-end atomic broadcast that solves the problem, i.e., can be used to implement 2-safe database replication. We also introduce a new safety criterion, called group-safety, that has advantages both over 1-safety and 2-safety. Experimental results show the gain of efficiency of group-safety over lazy replication, which ensures only 1-safety.}} @incollection{KD00, Author = {Keidar, I. 
and Dolev, D.}, Booktitle = {Dependable Network Computing}, Chapter = {3}, Date-Modified = {2006-04-11 10:34:56 +0900}, Editor = {Avresky, D.}, Keywords = {partitionable membership, atomic broadcast, toolkit}, Language = {English}, Publisher = {Kluwer Academic Publishers}, Title = {Totally Ordered Broadcast in the Face of Network Partitions}, Year = {2000}} @article{Lamport86, Author = {Lamport, L.}, Date-Modified = {2006-05-01 17:30:17 +0900}, Journal = {Distributed Computing}, Keywords = {distributed computing}, Language = {English}, Pages = {77--101}, Title = {On interprocess communication}, Url = {http://citeseer.nj.nec.com/227917.html}, Volume = {1}, Year = {1986}} @article{Lamport86b, Author = {Lamport, L.}, Journal = {Journal of the ACM}, Language = {English}, Number = {2}, Pages = {313--326}, Title = {The mutual exclusion problem: part {I} -- a theory of interprocess communication}, Url = {http://portal.acm.org/citation.cfm?doid=5383.5384}, Volume = {33}, Year = {1986}} @article{Lamport86c, Author = {Lamport, L.}, Journal = {Journal of the ACM}, Language = {English}, Number = {2}, Pages = {327--348}, Title = {The mutual exclusion problem: part {II} -- statement and solutions}, Url = {http://doi.acm.org/10.1145/5383.5385}, Volume = {33}, Year = {1986}} @phdthesis{Amir95, Address = {Israel}, Author = {Amir, Y.}, Date-Modified = {2006-03-08 15:28:33 +0900}, Keywords = {partitionable membership, database, replication}, Language = {English}, School = {Hebrew University of Jerusalem}, Title = {Replication using group communication over a partitioned network}, Url = {http://www.cnds.jhu.edu/publications/yair-phd.ps}, Year = {1995}} @inproceedings{Cohen03, Address = {Berkeley, CA, USA}, Author = {Cohen, B.}, Booktitle = {Proceedings of the Workshop on Economics of Peer-to-Peer Systems}, Keywords = {P2P}, Language = {English}, Title = {Incentives Build Robustness in {BitTorrent}}, Url = {http://citeseer.nj.nec.com/cohen03incentives.html}, Year = {2003}}
@article{BKK+03, Author = {Balakrishnan, H. and Kaashoek, M. F. and Karger, D. and Morris, R. and Stoica, I.}, Journal = {Communications of the ACM}, Keywords = {P2P}, Language = {English}, Number = {2}, Pages = {43--48}, Title = {Looking up data in {P2P} Systems}, Volume = {46}, Year = {2003}} @inproceedings{PSU+02c, Author = {Pedone, F. and Schiper, A. and Urb{\'a}n, P. and Cavin, D.}, Booktitle = {Proc. Int'l Conf. on Dependable Systems and Networks (DSN), supplemental volume}, Date-Modified = {2006-05-17 16:59:45 +0900}, Keywords = {oracle, failure detection}, Month = {June}, Note = {Fast abstract.}, Pages = {B-32--B-33}, Title = {Weak Ordering Oracles for Failure Detection-Free Systems}, Url = {http://lsewww.epfl.ch/Publications/ById/304.html}, Year = {2002}} @article{KPA+03, Author = {Kemme, B. and Pedone, F. and Alonso, G. and Schiper, A. and Wiesmann, M.}, Date-Modified = {2006-08-01 12:06:08 +0900}, Journal = {IEEE Transactions on Knowledge and Data Engineering}, Keywords = {Dragon, database replication, optimistic technique, group communication}, Number = {4}, Pages = {1018--1032}, Title = {Using Optimistic Atomic Broadcast in Transaction Processing Systems}, Url = {http://doi.ieeecomputersociety.org/10.1109/TKDE.2003.1209016}, Volume = {15}, Year = {2003}, Abstract = {Atomic broadcast primitives are often proposed as a mechanism to allow fault-tolerant cooperation between sites in a distributed system. Unfortunately, the delay incurred before a message can be delivered makes it difficult to implement high performance, scalable applications on top of atomic broadcast primitives. Recently, a new approach has been proposed for atomic broadcast which, based on optimistic assumptions about the communication system, reduces the average delay for message delivery to the application.
In this paper, we develop this idea further and show how applications can take even more advantage of the optimistic assumption by overlapping the coordination phase of the atomic broadcast algorithm with the processing of delivered messages. In particular, we present a replicated database architecture that employs the new atomic broadcast primitive in such a way that communication and transaction processing are fully overlapped, providing high performance without relaxing transaction correctness.}} @techreport{KPS+03, Address = {Switzerland}, Author = {Kup{\v s}ys, A. and Pleisch, S. and Schiper, A. and Wiesmann, M.}, Date-Modified = {2006-08-01 13:51:15 +0900}, Institution = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne}, Keywords = {MOM}, Language = {English}, Number = {200353}, Title = {Towards {JMS}-Compliant Group Communication}, Url = {http://infoscience.epfl.ch/search.py?recid=52560}, Year = {2003}, Abstract = {Group communication provides communication primitives with various semantics and their use greatly simplifies the development of highly available services. However, despite tremendous advances in research and numerous prototypes, group communication stays confined to small niches and academic prototypes. In contrast, message-oriented middleware such as the Java Messaging Service (JMS) is widely used, and has become a de-facto standard. We believe that the lack of standard interfaces is the reason that hinders the deployment of group communication systems. Since JMS is well-established, an interesting solution is to map group communication primitives onto the JMS API. This requires to adapt the traditional specifications of group communication in order to take into account the features of JMS. The resulting group communication API, together with corresponding specifications, defines group communication primitives compatible with the JMS syntax and semantics.}} @inproceedings{HDK03, Address = {Sorrento, Italy}, Author = {Hayashibara, N. 
and D{\'e}fago, X. and Katayama, T.}, Booktitle = {Proceedings of the Workshop on Adaptive Distributed Systems (WADiS03)}, Date-Modified = {2005-11-02 15:24:55 +0900}, Keywords = {failure detection}, Month = {October}, Note = {In conjunction with the 17th International Symposium on Distributed Computing (DISC 2003)}, Pages = {22--27}, Title = {Two-ways Adaptive Failure Detection with the {$\varphi$}-Failure Detector}, Url = {http://ddsg.jaist.ac.jp/en/pub/HDK03.html}, Year = {2003}} @inproceedings{DHK03, Address = {Ishikawa, Japan}, Author = {D{\'e}fago, X. and Hayashibara, N. and Katayama, T.}, Booktitle = {Proceedings of International Symposium on Towards Peta-Bit Ultra-Networks (PBit 2003)}, Date-Modified = {2005-11-02 15:22:32 +0900}, Isbn = {4-9900330-3-5}, Keywords = {failure detection}, Language = {English}, Pages = {88--95}, Title = {On the Design of a Failure Detection Service for Large-Scale Distributed Systems}, Url = {http://ddsg.jaist.ac.jp/en/pub/DHK03.html}, Year = {2003}, Abstract = {It is widely recognized that distributed systems would greatly benefit from the availability of a generic failure detection service. There are however several issues that must be addressed before such a service can actually be implemented. In this paper, we highlight the main issues related to ensuring failure detection in large-scale systems, and overview the main solutions proposed in the literature so far. Then, we outline a pragmatic architecture for a failure detector service based on the $\varphi$-failure detector, and a combination of techniques proposed in related work. }} @phdthesis{Narasimhan99, Address = {California, USA}, Author = {Narasimhan, P.}, Keywords = {CORBA}, Language = {English}, Month = {September}, School = {University of California, Santa Barbara}, Title = {Transparent Fault Tolerance for {CORBA}}, Url = {http://www.cs.cmu.edu/~priya/thesis.ps}, Year = {1999}} @inproceedings{BMT02, Address = {Osaka, Japan}, Author = {Baldoni, R.
and Marchetti, C. and Termini, A.}, Booktitle = {Proceedings of the 21st Symposium on Reliable Distributed Systems (SRDS'02)}, Keywords = {CORBA}, Language = {English}, Month = {October}, Pages = {109--118}, Publisher = {IEEE}, Title = {Active Software Replication through a Three-tier Approach}, Url = {http://www.dis.uniroma1.it/~irl/pubs/SRDS02.pdf}, Year = {2002}} @inproceedings{WS04, Author = {Wiesmann, M. and Schiper, A.}, Booktitle = {Proc. of the 9$^{th}$ Int. Conf. on Extending Database Technology (EDBT2004)}, Date-Modified = {2006-10-25 17:38:06 +0900}, Isbn = {3-540-21200-0}, Keywords = {serializability, database replication, fault-tolerance}, Language = {English}, Location = {Heraklion, Crete, Greece}, Month = {March}, Organization = {VLDB}, Pages = {165--182}, Publisher = {Springer}, Series = {Lecture Notes in Computer Science}, Title = {Beyond 1-Safety and 2-Safety for replicated databases: Group-Safety}, Url = {http://infoscience.epfl.ch/search.py?recid=49907&ln=fr}, Volume = {2992}, Year = {2004}, Abstract = {In this paper, we study the safety guarantees of group communication-based database replication techniques. We show that there is a model mismatch between group communication and database, and because of this, classical group communication systems cannot be used to build 2-safe database replication. We propose a new group communication primitive called end-to-end atomic broadcast that solves the problem, i.e., can be used to implement 2-safe database replication. We also introduce a new safety criterion, called group-safety, that has advantages both over 1-safety and 2-safety. Experimental results show the gain of efficiency of group-safety over lazy replication, which ensures only 1-safety. }} @article{Gokhale98, Author = {Gokhale, A.~S.
and Schmidt, D.~C.}, Journal = {IEEE Transactions on Computers}, Keywords = {CORBA}, Language = {English}, Number = {4}, Pages = {391--413}, Title = {Measuring and Optimizing CORBA Latency and Scalability Over High-Speed Networks}, Url = {http://csdl.computer.org/comp/trans/tc/1998/04/t0391abs.htm}, Volume = {47}, Year = {1998}} @incollection{ES99, Author = {Ezhilchelvan, P. D. and Shrivastava, S. K.}, Booktitle = {Advances in Distributed Systems, Advanced Distributed Computing: From Algorithms to Systems}, Editor = {Krakowiak, S. and Shrivastava, S. K.}, Isbn = {3-540-67196-X}, Language = {English}, Pages = {79--103}, Publisher = {Springer}, Series = {Lecture Notes in Computer Science}, Title = {Enhancing Replica Management Services to Cope with Group Failures}, Url = {http://www.springerlink.com/app/home/contribution.asp?wasp=1a48tvwrrkdye1lp8fvk&referrer=parent&backto=issue,4,21;journal,1074,1397;linkingpublicationresults,id:105633,1}, Volume = {1752}, Year = {1999}} @inproceedings{WMS02a, Author = {Wojciechowski, P. T. and Mena, S. and Schiper, A.}, Booktitle = {Proceedings of Coordination 2002 (The Fifth International Conference on Coordination Models and Languages)}, Editor = {Arbab, F. and Talcott, C.}, Keywords = {micro-protocols composition}, Language = {English}, Month = {April}, Pages = {389--404}, Publisher = {Springer}, Series = {Lecture Notes in Computer Science}, Title = {Semantics of Protocol Modules Composition and Interaction}, Url = {http://lsewww.epfl.ch/Publications/ById/315.html}, Year = {2002}} @manual{Oracle:SCN8, Address = {Oracle Corporation, 500 Oracle Parkway, Redwoord City, CA 94065}, Author = {Leverenz, L. and Mateosian, R. 
and Bobrowski, S.}, Date-Modified = {2006-08-01 14:13:57 +0900}, Keywords = {transaction processing, commercial}, Language = {English}, Part-No = {A54643-01}, Publisher = {{O}racle Corporation}, Release = {8.0}, Title = {{O}racle8 Server Concepts}, Url = {http://www.oracle.com/support/documentation/oracle8/SCN80.pdf}, Year = {1997}} @comment{BibDesk Smart Groups{ conditions comparison 2 key Author value Wiesmann version 1 conjunction 0 group name My Papers }}