%%% Fallback definition of \pkg (typewriter text); it takes effect only
%%% when the citing document has not already defined \pkg.
@Preamble{
    "\ifx \undefined \pkg \def \pkg #1{{{\tt #1}}} \fi"
}
%%% Acknowledgement abbreviation, referenced by the acknowledgement
%%% field of the entries in this file.
@String{ack-nhfb                = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@computer.org| (Internet),
URL: \path|https://www.math.utah.edu/~beebe/|"}
%%% Institution abbreviations:
@String{inst-CSC                = "Center for Scientific Computing,
Department of Mathematics, University of
Utah"}
@String{inst-CSC:adr            = "Salt Lake City, UT 84112, USA"}
@String{inst-CSU                = "Colorado State University"}
@String{inst-CSU:adr            = "Fort Collins, CO, USA"}
@String{inst-NLRC               = "NASA Langley Research Center"}
@String{inst-NLRC:adr           = "Hampton, VA, USA"}
@String{inst-SRC-IDA            = "Supercomputing Research Center: IDA"}
@String{inst-SRC-IDA:adr        = "Lanham, MD, USA"}
@String{inst-U-MARYLAND         = "University of Maryland"}
@String{inst-U-MARYLAND:adr     = "College Park, MD, USA"}
@String{inst-UCB-EECS           = "Department of Electrical Engineering and
Computer Science, University of California,
Berkeley"}
@String{inst-UCB-EECS:adr       = "Berkeley, CA, USA"}
@String{inst-UIUC-CSRD          = "University of Illinois at Urbana-Champaign,
Center for Supercomputing Research and
Development"}
@String{inst-UIUC-CSRD:adr      = "Urbana, IL 61801, USA"}
@String{inst-UT-CS              = "Department of Computer Science, University of
Tennessee, Knoxville"}
@String{inst-UT-CS:adr          = "Knoxville, TN 37996, USA"}

%%% Journal abbreviations:
@String{j-ACM-COMM-COMP-ALGEBRA = "ACM Communications in Computer Algebra"}
@String{j-ACM-J-EXP-ALGORITHMICS = "ACM Journal of Experimental Algorithmics"}
@String{j-ACTA-INFO             = "Acta Informatica"}
@String{j-ADA-USER              = "Ada User"}
@String{j-ALGORITHMICA          = "Algorithmica"}
@String{j-ALGORITHMS-BASEL      = "Algorithms ({Basel})"}
@String{j-APPL-MATH-COMP        = "Applied Mathematics and Computation"}
@String{j-APPL-NUM-MATH         = "Applied Numerical Mathematics: Transactions
of IMACS"}
@String{j-BYTE                  = "Byte Magazine"}
@String{j-C-PLUS-PLUS-REPORT    = "C++ Report"}
@String{j-CACM                  = "Communications of the ACM"}
@String{j-CCCUJ                 = "C/C++ Users Journal"}
@String{j-CCPE                  = "Concurrency and Computation: Prac\-tice and
Experience"}
@String{j-CG-WORLD              = "Computer Graphics World"}
@String{j-COMP-ARCH-NEWS        = "ACM SIGARCH Computer Architecture News"}
@String{j-COMP-GRAPHICS         = "Computer Graphics"}
@String{j-COMP-J                = "The Computer Journal"}
@String{j-COMP-NET-AMSTERDAM    = "Computer Networks (Amsterdam, Netherlands:
1999)"}
@String{j-COMP-PHYS-COMM        = "Computer Physics Communications"}
@String{j-COMP-SURV             = "ACM Computing Surveys"}
@String{j-COMP-SYS              = "Computing Systems"}
@String{j-COMPUT-MATH-APPL      = "Computers and Mathematics with Applications"}
@String{j-COMPUT-PHYS           = "Computers in Physics"}
@String{j-COMPUT-SCI-ENG        = "Computing in Science and Engineering"}
@String{j-COMPUTER              = "Computer"}
@String{j-COMPUTERS-AND-GRAPHICS = "Computers and Graphics"}
@String{j-COMPUTING             = "Computing"}
@String{j-CPE                   = "Concurrency: Prac\-tice and Experience"}
@String{j-CUJ                   = "C Users Journal"}
@String{j-DATAMATION            = "Datamation"}
@String{j-DDJ                   = "Dr. Dobb's Journal of Software Tools"}
@String{j-DEC-TECH-J            = "Digital Technical Journal"}
@String{j-DISTRIB-COMPUT        = "Distributed Computing"}
@String{j-ELECTRONIK            = "Elektronik"}
@String{j-FORM-ASP-COMPUT       = "Formal Aspects of Computing"}
@String{j-FUND-INFO             = "Fundamenta Informaticae"}
@String{j-FUT-GEN-COMP-SYS      = "Future Generation Computer Systems"}
@String{j-HIGHER-ORDER-SYMB-COMPUT = "Higher-Order and Symbolic Computation"}
@String{j-IBM-JRD               = "IBM Journal of Research and Development"}
@String{j-IBM-SYS-J             = "IBM Systems Journal"}
@String{j-IEEE-CGA              = "IEEE Computer Graphics and Applications"}
@String{j-IEEE-COMPUT-ARCHIT-LETT = "IEEE Computer Architecture Letters"}
@String{j-IEEE-COMPUT-SCI-ENG   = "IEEE Computational Science \& Engineering"}
@String{j-IEEE-CONCURR          = "IEEE Concurrency"}
@String{j-IEEE-DISTRIB-SYST-ONLINE = "IEEE Distributed Systems Online"}
@String{j-IEEE-INT-SYMP-HIGH-PERF-DIST-COMP-PROC = "IEEE International Symposium
on High Performance Distributed Computing,
Proceedings"}
@String{j-IEEE-MICRO            = "IEEE Micro"}
@String{j-IEEE-PAR-DIST-TECH    = "IEEE parallel and distributed technology:
systems and applications"}
@String{j-IEEE-SOFTWARE         = "IEEE Software"}
@String{j-IEEE-SPECTRUM         = "IEEE Spectrum"}
@String{j-IEEE-TRANS-BIG-DATA   = "IEEE Transactions on Big Data"}
@String{j-IEEE-TRANS-COMPUT     = "IEEE Transactions on Computers"}
@String{j-IEEE-TRANS-PAR-DIST-SYS = "IEEE Transactions on Parallel and
Distributed Systems"}
@String{j-IEEE-TRANS-SOFTW-ENG  = "IEEE Transactions on Software Engineering"}
@String{j-IEEE-TRANS-VIS-COMPUT-GRAPH = "IEEE Transactions on Visualization
and Computer Graphics"}
@String{j-IJHPCA                = "The International Journal of High
Performance Computing Applications"}
@String{j-IJQC                  = "International Journal of Quantum Chemistry"}
@String{j-INFO-PROC-LETT        = "Information Processing Letters"}
@String{j-INT-J-COMPUT-APPL     = "International Journal of Computers and
Applications"}
%%% Journal abbreviations for three "International Journal of ..." titles;
%%% each value is wrapped over two lines.
@String{j-INT-J-COMPUT-SYST-SCI-ENG = "International Journal of Computer
Systems Science and Engineering"}
@String{j-INT-J-HIGH-SPEED-COMPUTING = "International Journal of High
Speed Computing (IJHSC)"}
@String{j-INT-J-PAR-EMER-DIST-SYS = "International Journal of Parallel,
Emergent and Distributed Systems: IJPEDS"}
@String{j-INT-J-PARALLEL-PROG   = "International Journal of Parallel
Programming"}
@String{j-INT-J-SOFTW-TOOLS-TECHNOL-TRANSFER = "International Journal on
Software Tools for Technology Transfer
(STTT)"}
@String{j-INTEL-TECH-J          = "Intel Technology Journal"}
@String{j-J-ACM                 = "Journal of the ACM"}
@String{j-J-AUTOM-REASON        = "Journal of Automated Reasoning"}
@String{j-J-COMP-SECUR          = "Journal of Computer Security"}
@String{j-J-COMPUT-BIOL         = "Journal of Computational Biology"}
@String{j-J-COMPUT-CHEM         = "Journal of Computational Chemistry"}
@String{j-J-COMPUT-PHYS         = "Journal of Computational Physics"}
@String{j-J-GRAPHICS-TOOLS      = "Journal of Graphics Tools: JGT"}
@String{j-J-GRID-COMP           = "Journal of Grid Computing"}
@String{j-J-OPEN-SOURCE-SOFT    = "Journal of Open Source Software"}
@String{j-J-PAR-DIST-COMP       = "Journal of Parallel and Distributed
Computing"}
@String{j-J-STAT-SOFT           = "Journal of Statistical Software"}
@String{j-J-SUPERCOMPUTING      = "The Journal of Supercomputing"}
@String{j-J-SYMBOLIC-COMP       = "Journal of Symbolic Computation"}
@String{j-J-SYST-SOFTW          = "The Journal of Systems and Software"}
@String{j-J-UCS                 = "J.UCS: Journal of Universal Computer
Science"}
@String{j-JAVA-REPORT           = "{Java} Report: The Source for {Java}
Development"}
@String{j-JAVAWORLD             = "JavaWorld: IDG's magazine for the Java
community"}
@String{j-JERIC                 = "ACM Journal on Educational Resources in
Computing (JERIC)"}
@String{j-JETC                  = "ACM Journal on Emerging Technologies in
Computing Systems (JETC)"}
@String{j-LECT-NOTES-COMP-SCI   = "Lecture Notes in Computer Science"}
@String{j-LINUX-J               = "Linux Journal"}
@String{j-LOGIN                 = ";login: the USENIX Association newsletter"}
@String{j-MICROPROC-MICROSYS    = "Microprocessors and Microsystems"}
@String{j-MICROSOFT-SYS-J       = "Microsoft Systems Journal"}
@String{j-NORDIC-J-COMPUT       = "Nordic Journal of Computing"}
@String{j-NUMER-ALGORITHMS      = "Numerical Algorithms"}
@String{j-ONLINE-CDROM-REV      = "Online \& CDROM review: the international
journal of online \& optical
information systems"}
@String{j-OPEN-SYSTEMS-TODAY    = "Open Systems Today"}
@String{j-OPER-SYS-REV          = "Operating Systems Review"}
@String{j-PACMPL                = "Proceedings of the ACM on Programming
Languages (PACMPL)"}
@String{j-PARALLEL-COMPUTING    = "Parallel Computing"}
@String{j-PARALLEL-DIST-COMP-PRACT = "Parallel and Distributed Computing
Practices"}
@String{j-PARALLEL-PROCESS-LETT = "Parallel Processing Letters"}
@String{j-POMACS                = "Proceedings of the ACM on Measurement and
Analysis of Computing Systems (POMACS)"}
@String{j-PROC-REAL-TIME-SYS-SYMP = "Proceedings --- Real-Time Systems
Symposium"}
@String{j-PROC-VLDB-ENDOWMENT   = "Proceedings of the VLDB Endowment"}
@String{j-QUEUE                 = "ACM Queue: Tomorrow's Computing Today"}
@String{j-REAL-TIME-SYST        = "Real-Time Systems"}
@String{j-SCI-COMPUT-PROGRAM    = "Science of Computer Programming"}
@String{j-SCI-PROG              = "Scientific Programming"}
@String{j-SCPE                  = "Scalable Computing: Practice and Experience"}
@String{j-SIAM-J-COMPUT         = "SIAM Journal on Computing"}
@String{j-SIAM-J-SCI-COMP       = "SIAM Journal on Scientific Computing"}
@String{j-SIGADA-LETTERS        = "ACM SIGADA Ada Letters"}
@String{j-SIGAPP                = "ACM SIGAPP Applied Computing Review"}
@String{j-SIGCSE                = "SIGCSE Bulletin (ACM Special Interest Group
on Computer Science Education)"}
@String{j-SIGMETRICS            = "ACM SIGMETRICS Performance Evaluation
Review"}
@String{j-SIGMICRO              = "ACM SIGMICRO Newsletter"}
@String{j-SIGMOD                = "SIGMOD Record (ACM Special Interest Group
on Management of Data)"}
@String{j-SIGPLAN               = "ACM SIG{\-}PLAN Notices"}
@String{j-SIGSOFT               = "ACM SIGSOFT Software Engineering Notes"}
@String{j-SPE                   = "Soft{\-}ware\emdash Prac{\-}tice and
Experience"}
@String{j-SUPERCOMPUTER         = "Supercomputer"}
@String{j-TACO                  = "ACM Transactions on Architecture and Code
Optimization"}
@String{j-TCBB                  = "IEEE/ACM Transactions on Computational
Biology and Bioinformatics"}
@String{j-TECS                  = "ACM Transactions on Embedded Computing
Systems"}
@String{j-THEOR-COMP-SCI        = "Theoretical Computer Science"}
@String{j-TISSEC                = "ACM Transactions on Information and System
Security"}
@String{j-TIST                  = "ACM Transactions on Intelligent Systems and
Technology (TIST)"}
@String{j-TKDD                  = "ACM Transactions on Knowledge
Discovery from Data (TKDD)"}
@String{j-TOCHI                 = "ACM Transactions on Computer-Human
Interaction"}
@String{j-TOCS                  = "ACM Transactions on Computer Systems"}
@String{j-TOCL                  = "ACM Transactions on Computational Logic"}
@String{j-TODAES                = "ACM Transactions on Design Automation of
Electronic Systems"}
@String{j-TODS                  = "ACM Transactions on Database Systems"}
@String{j-TOG                   = "ACM Transactions on Graphics"}
@String{j-TOIS                  = "ACM Transactions on Information Systems"}
@String{j-TOMACS                = "ACM Transactions on Modeling and
Computer Simulation"}
@String{j-TOMPECS               = "ACM Transactions on Modeling and Performance
Evaluation of Computing Systems (TOMPECS)"}
@String{j-TOMS                  = "ACM Transactions on Mathematical Software"}
@String{j-TOPC                  = "ACM Transactions on Parallel Computing
(TOPC)"}
@String{j-TOPLAS                = "ACM Transactions on Programming Languages
and Systems"}
@String{j-TOSEM                 = "ACM Transactions on Software Engineering
and Methodology"}
@String{j-UNIX-REVIEW           = "UNIX review"}
@String{j-UNIXWORLD-OPEN-COMP   = "UnixWorld's Open Computing"}
@String{j-VLDB-J                = "VLDB Journal: Very Large Data Bases"}
@String{j-WEB-TECHNIQUES        = "Web Techniques"}
@String{j-X-RESOURCE            = "{The X Resource}"}

%%% Publishers and their addresses:
@String{pub-ACM                 = "ACM Press"}
@String{pub-ACM:adr             = "New York, NY 10036, USA"}
@String{pub-AP                  = "Academic Press"}
@String{pub-AP:adr              = "New York, USA"}
@String{pub-APRESS              = "Apress"}
@String{pub-APRESS:adr          = "Berkeley, CA, USA"}
@String{pub-AW                  = "Ad{\-d}i{\-s}on-Wes{\-l}ey"}
@String{pub-AW:adr              = "Reading, MA, USA"}
@String{pub-AWDP                = "Ad{\-d}i{\-s}on-Wes{\-l}ey Developers
Press"}
@String{pub-AWDP:adr            = "Reading, MA, USA"}
@String{pub-EYROLLES            = "Editions Eyrolles"}
@String{pub-EYROLLES:adr        = "Paris, France"}
@String{pub-HERMES              = "Hermes"}
@String{pub-HERMES:adr          = "Paris, France"}
@String{pub-IEEE                = "IEEE Computer Society Press"}
@String{pub-IEEE:adr            = "1109 Spring Street, Suite 300, Silver
Spring, MD 20910, USA"}
@String{pub-KLUWER              = "Kluwer Academic Publishers"}
@String{pub-KLUWER:adr          = "Dordrecht, The Netherlands; Boston, MA,
USA"}
@String{pub-LEARNED-INF         = "Learned Information"}
@String{pub-LEARNED-INF:adr     = "Medford, NJ, USA"}
@String{pub-MCGRAW-HILL         = "Mc{\-}Graw-Hill"}
@String{pub-MCGRAW-HILL:adr     = "New York, NY, USA"}
@String{pub-MIT                 = "MIT Press"}
@String{pub-MIT:adr             = "Cambridge, MA, USA"}
@String{pub-MORGAN-KAUFMANN     = "Morgan Kaufmann Publishers"}
@String{pub-MORGAN-KAUFMANN:adr = "Los Altos, CA 94022, USA"}
@String{pub-MORGAN-KAUFMANN:adrnew = "2929 Campus Drive, Suite 260, San
Mateo, CA 94403, USA"}
@String{pub-NO-STARCH           = "No Starch Press"}
@String{pub-NO-STARCH:adr       = "San Francisco, CA, USA"}
@String{pub-NTIS                = "National Technical Information Service"}
@String{pub-NTIS:adr            = "Washington, DC, USA"}
@String{pub-ORA                 = "O'Reilly \& Associates, Inc."}
@String{pub-ORA:adr             = "981 Chestnut Street, Newton, MA 02164, USA"}
@String{pub-ORA-MEDIA           = "O'Reilly Media, Inc."}
@String{pub-ORA-MEDIA:adr       = "1005 Gravenstein Highway North, Sebastopol,
CA 95472, USA"}
@String{pub-PACKT               = "Packt Publishing"}
@String{pub-PACKT:adr           = "Birmingham, UK"}
@String{pub-PH                  = "Pren{\-}tice-Hall"}
@String{pub-PH:adr              = "Englewood Cliffs, NJ 07632, USA"}
@String{pub-PHI                 = "Pren{\-}tice-Hall International"}
@String{pub-PHI:adr             = "Englewood Cliffs, NJ 07632, USA"}
@String{pub-PHPTR               = "P T R Pren{\-}tice-Hall"}
@String{pub-PHPTR:adr           = "Englewood Cliffs, NJ 07632, USA"}
@String{pub-SAMS                = "Howard W. Sams"}
@String{pub-SAMS:adr            = "Indianapolis, IN 46268, USA"}
@String{pub-SUN                 = "Sun Microsystems"}
@String{pub-SUN:adr             = "2550 Garcia Avenue, Mountain View, CA
94043, USA"}
@String{pub-SUN-MICROSYSTEMS-PRESS = "Sun Microsystems Press"}
@String{pub-SUN-MICROSYSTEMS-PRESS:adr = "Palo Alto, CA, USA"}
@String{pub-SUNSOFT             = "SunSoft Press"}
@String{pub-SUNSOFT:adr         = "Mountain View, CA, USA"}
@String{pub-SV                  = "Spring{\-}er-Ver{\-}lag"}
@String{pub-SV:adr              = "Berlin, Germany~/ Heidelberg,
Germany~/ London, UK~/ etc."}
@String{pub-UKUUG               = "UK Unix Users Group"}
@String{pub-UKUUG:adr           = "Buntingford, Herts, UK"}
@String{pub-USENIX              = "USENIX Association"}
@String{pub-USENIX:adr          = "Berkeley, CA, USA"}
@String{pub-WILEY               = "John Wiley and Sons"}
@String{pub-WILEY:adr           = "New York, NY, USA; London, UK; Sydney,
Australia"}
@String{pub-WORLD-SCI           = "World Scientific Publishing Co."}
@String{pub-WORLD-SCI:adr       = "Singapore; Philadelphia, PA, USA; River
Edge, NJ, USA"}

%%% Series abbreviations:
@String{ser-LNCS                = "Lecture Notes in Computer Science"}
author = "C. W. Bettcher",
title = "Thread standardization and relative cost",
journal = j-COMP-ARCH-NEWS,
volume = "2",
number = "1",
pages = "9--9",
month = jan,
year = "1973",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:28 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
remark = "This is a reprint of an article published in the {\em
Journal of the Society of Automotive Engineers}, Volume
XVIII, Number 2, p. 131, February 1926, about the cost
of the lack of standardization of screw threads. {\em
Computer Architecture News\/} Editor-in-Chief Caxton C.
Foster has added a hand-written note ``of course, there
is no message here for {\em us}.''",
author = "Connie Smith and J. C. Browne",
title = "Aspects of software design analysis: {Concurrency} and
journal = j-SIGMETRICS,
volume = "9",
number = "2",
pages = "245--253",
month = "Summer",
year = "1980",
CODEN = "????",
DOI = "https://doi.org/10.1145/1009375.806169",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Thu Jun 26 10:54:53 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper extends previous work on development of a
methodology for the prediction of the performance of
computer software systems from design level
specifications and continuing through implementation.
The effects of synchronized behavior, such as results
from data reservation in multi-thread executions of
data base systems, and competition for host system
resources are incorporated. The previous methodology
uses hierarchical graphs to represent the execution of
software on some host computer system (or on some
abstract machine). Performance metrics such as response
time were obtained from analysis of these graphs
assuming execution of a single copy on a dedicated
host. This paper discusses the mapping of these
execution graphs upon queueing network models of the
host computing environment to yield performance metric
estimates for more complex and realistic processing
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
author = "J. E. Jonak",
title = "Experience with a {FORTH}-like language",
journal = j-SIGPLAN,
volume = "21",
number = "2",
pages = "27--36",
month = feb,
year = "1986",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:14:55 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C6110 (Systems analysis and programming); C6140D
(High level languages)",
corpsource = "Sperry Network Syst., London, UK",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "FORTH; languages; programming; threaded code
pubcountry = "USA A03",
subject = "D.3.2 Software, PROGRAMMING LANGUAGES, Language
Classifications, FORTH",
treatment = "P Practical",
author = "Paul R. McJones and Garret Frederick Swart",
title = "Evolving the {UNIX} system interface to support
multithreaded programs: The {Topaz Operating System}
programmer's manual",
volume = "21",
publisher = "Digital Systems Research Center",
address = "Palo Alto, CA, USA",
pages = "100",
day = "28",
month = sep,
year = "1987",
LCCN = "QA76.76.O63M42 1987",
bibdate = "Fri Aug 7 08:29:38 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "Systems Research Center",
acknowledgement = ack-nhfb,
keywords = "computer networks; Computer networks; electronic data
processing -- distributed processing; Electronic data
processing -- Distributed processing; multithreaded
operating system interface -- Topaz operating;
Operating systems (Computers); operating systems
(computers); system; UNIX (computer file); UNIX
(Computer operating system)",
author = "P. P. Tanner",
title = "Multi-thread input",
journal = j-COMP-GRAPHICS,
volume = "21",
number = "2",
pages = "142--145",
month = apr,
year = "1987",
ISSN = "0097-8930 (print), 1558-4569 (electronic)",
ISSN-L = "0097-8930",
bibdate = "Tue Mar 12 17:52:38 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Computer Graphics",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J166",
author = "P. D. Gilbert",
title = "Development of the {VAX NOTES} system",
journal = j-DEC-TECH-J,
volume = "1",
number = "6",
pages = "117--124",
month = feb,
year = "1988",
ISSN = "0898-901X",
bibdate = "Thu Mar 20 18:15:43 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classcodes = "C6110B (Software engineering techniques); C7410F
corpsource = "Digital Equipment Corp., Hudson, MA, USA",
fjournal = "Digital Technical Journal",
keywords = "callable interface; communications tool; computer
conferencing; DEC; DEC computers; discussions; human
factors; human-factors engineering; interfaces; medium;
multiprogramming; multitasking; multithreaded server;
online; program; program testing; software engineering;
storage; technical writer; teleconferencing; testing;
user; user interface; VAX NOTES",
treatment = "P Practical",
author = "R. H. {Halstead, Jr.} and T. Fujita",
title = "{MASA}: a multithreaded processor architecture for
parallel symbolic computing",
journal = j-COMP-ARCH-NEWS,
volume = "16",
number = "2",
pages = "443--451",
month = may,
year = "1988",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:45 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Anant Agarwal",
title = "Performance tradeoffs in multithreaded processors",
number = "89-566",
institution = "Massachusetts Institute of Technology, Microsystems
Program Office",
address = "Cambridge, MA, USA",
pages = "30",
year = "1989",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "VLSI memo",
acknowledgement = ack-nhfb,
author = "M. Amamiya",
title = "Data Flow Computing and Parallel Reduction Machine",
journal = j-FUT-GEN-COMP-SYS,
volume = "4",
number = "??",
pages = "53--67",
month = "????",
year = "1989",
ISSN = "0167-739X (print), 1872-7115 (electronic)",
ISSN-L = "0167-739X",
bibdate = "Wed Feb 27 18:37:19 2002",
bibsource = "ftp://ftp.ira.uka.de/bibliography/Compiler/Functional.bib;
fjournal = "Future Generation Computer Systems",
journal-URL = "http://www.sciencedirect.com/science/journal/0167739X",
keywords = "functional cell toke flow multi-thread control flow
author = "Andrew D. Birrell",
title = "An introduction to programming with threads",
type = "SRC reports",
number = "35",
institution = "Digital Systems Research Center",
address = "Palo Alto, CA, USA",
pages = "35",
day = "6",
month = jan,
year = "1989",
LCCN = "QA76.6.B5729 1989",
bibdate = "Fri May 10 12:18:17 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "parallel programming (computer science);
author = "Jean-Pierre Briot",
title = "From objects to actors: study of a limited symbiosis
in {Smalltalk-80}",
journal = j-SIGPLAN,
volume = "24",
number = "4",
pages = "69--72",
month = apr,
year = "1989",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:15:37 MST 2003",
bibsource = "Compendex database; http://portal.acm.org/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/67386/p69-briot/",
abstract = "In this paper we describe an implementation of actors
in Smalltalk-80, named Actalk. This attempt is designed
as a minimal extension preserving the Smalltalk-80
language. Actors are active and autonomous objects, as
opposed to standard passive Smalltalk-80 objects. An
actor is built from a standard Smalltalk-80 object by
associating a process with it and by serializing the
messages it could receive into a queue. We will study
the cohabitation and synergy between the two models of
computations: transfer of active messages (message and
thread of activity) between passive objects, and
exchange of passive messages between active objects. We
propose a sketch of methodology in order to have a safe
combination between these two programming paradigms.",
acknowledgement = ack-nhfb,
affiliation = "Univ Paris VI",
affiliationaddress = "Paris, Fr",
classification = "723",
conference = "Proceedings of the ACM SIGPLAN Workshop on
Object-Based Concurrent Programming",
confname = "Proceedings of the ACM SIGPLAN workshop on
Object-based concurrent programming, September 26--27
1988, San Diego, CA",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
journalabr = "SIGPLAN Not",
keywords = "Actor Based Systems; Computer Metatheory--Programming
Theory; Computer Programming Languages; Concurrent
Programming; Design; design; languages; Object-Based
Programming; Smalltalk-80",
meetingaddress = "San Diego, CA, USA",
meetingdate = "Sep 26--27 1988",
meetingdate2 = "09/26--27/88",
subject = "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language
Classifications, Smalltalk-80. {\bf D.1.3} Software,
PROGRAMMING TECHNIQUES, Concurrent Programming. {\bf
D.4.1} Software, OPERATING SYSTEMS, Process Management,
author = "Denis Caromel",
title = "A general model for concurrent and distributed
object-oriented programming",
journal = j-SIGPLAN,
volume = "24",
number = "4",
pages = "102--104",
month = apr,
year = "1989",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:15:37 MST 2003",
bibsource = "Compendex database; http://portal.acm.org/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/67386/p102-caromel/",
abstract = "This paper presents a general model supporting
object-oriented programming in concurrent as well as
distributed environments. The model combines the
advantages of remote procedure calls with those of
message passing. It relies on the following concepts:
All objects are not active but the active entities are
objects, Asynchronous Message Passing with Data-driven
synchronization, and Service mechanism allowing an
explicit thread of control.",
acknowledgement = ack-nhfb,
affiliation = "CNRS",
affiliationaddress = "Vandoeuvres-les-Nancy, Fr",
classification = "722; 723",
conference = "Proceedings of the ACM SIGPLAN Workshop on
Object-Based Concurrent Programming",
confname = "Proceedings of the ACM SIGPLAN workshop on
Object-based concurrent programming, September 26--27
1988, San Diego, CA",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
journalabr = "SIGPLAN Not",
keywords = "Computer Systems Programming; Computer Systems,
Digital--Distributed; Concurrent Programming; design;
Multiprocessing Programs; Object-Oriented Programming",
meetingaddress = "San Diego, CA, USA",
meetingdate = "Sep 26--27 1988",
meetingdate2 = "09/26--27/88",
subject = "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
Concurrent Programming. {\bf D.1.m} Software,
PROGRAMMING TECHNIQUES, Miscellaneous. {\bf D.4.7}
Software, OPERATING SYSTEMS, Organization and Design,
Distributed systems. {\bf D.4.1} Software, OPERATING
SYSTEMS, Process Management, Concurrency.",
author = "Carlos {Carreras Vaquer}",
title = "Architecture and performance evaluation of a
multithreaded cache design",
type = "Thesis ({M.S. in Engineering})",
school = "University of Texas at Austin",
address = "Austin, TX, USA",
pages = "xii + 108",
year = "1989",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Cache memory; Computer architecture; Computer storage
devices; Integrated circuits -- Very large scale
integration; Microprocessors",
author = "Deborah L. Caswell and David L. Black",
title = "Implementing a {Mach} debugger for multithreaded
type = "Research paper",
number = "CMU-CS-89-154",
institution = "Carnegie Mellon University, Computer Science Dept.",
address = "Pittsburgh, PA, USA",
pages = "13",
month = nov,
year = "1989",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "To appear in the Conference Proceedings of Winter 1990
USENIX Technical Conference and Exhibition, Washington,
DC, January, 1990.",
abstract = "Multiple threads of control add new challenges to the
task of application debugging, and require the
development of new debuggers to meet these challenges.
This paper describes the design and implementation of
modifications to an existing debugger (gdb) for
debugging multithreaded applications under the Mach
operating system. It also describes the operating
system facilities that support it. Although certain
implementation details are specific to Mach, the
underlying design principles are applicable to other
systems that support threads in a Unix compatible
acknowledgement = ack-nhfb,
annote = "Supported by the Space and Naval Warfare Systems
keywords = "Debugging in computer science -- Computer programs",
author = "Joseph A. Korty",
title = "{Sema}: a {Lint-like} Tool for Analyzing Semaphore
Usage in a Multithreaded {UNIX} Kernel",
crossref = "USENIX:1989:PWU",
institution = "MODCOMP",
pages = "113--123",
month = "Winter",
year = "1989",
bibdate = "Wed Aug 13 10:48:45 MDT 1997",
bibsource = "ftp://ftp.uu.net/library/bibliography;
acknowledgement = ack-nhfb,
affiliation = "MODCOMP",
author = "H. Massalin and C. Pu",
title = "Threads and input\slash output in the synthesis
journal = j-OPER-SYS-REV,
volume = "23",
number = "5",
pages = "191--201",
month = dec,
year = "1989",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 12:47:29 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Paul R. McJones and Garret F. Swart",
title = "Evolving the {UNIX} System Interface to Support
Multithreaded Programs",
crossref = "USENIX:1989:PWU",
pages = "393--404",
month = "Winter",
year = "1989",
bibdate = "Fri Oct 18 07:24:24 MDT 1996",
bibsource = "ftp://ftp.uu.net/library/bibliography;
acknowledgement = ack-nhfb,
affiliation = "DEC Systems Research Center",
author = "Kevin Brian Plyler",
title = "Adding multithreaded capabilities to the process
manager of the {BIGSAM} distributed operating system",
type = "Thesis ({M.S.})",
school = "Arizona State University",
address = "Tempe, AZ, USA",
pages = "x + 105 + 2",
year = "1989",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Electronic data processing -- Distributed processing;
Multiprocessors; Operating systems (Computers)",
author = "R. Rashid and R. Baron and A. Forin and D. Golub and
M. Jones and D. Orr and R. Sanzi",
title = "{Mach}: a foundation for open systems (operating
crossref = "IEEE:1989:WOS",
pages = "109--113",
year = "1989",
bibdate = "Sat Sep 28 20:21:01 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib;
acknowledgement = ack-nhfb,
affiliation = "Sch. of Comput. Sci., Carnegie-Mellon Univ.,
Pittsburgh, PA, USA",
classification = "C6110B (Software engineering techniques); C6150J
(Operating systems)",
keywords = "Hardware resources; Mach kernel; Multiserver Unix;
Multithreaded Unix server; Operating system; OS
emulation; Software development",
thesaurus = "File servers; Open systems; Operating systems
[computers]; Software engineering; Unix",
author = "Edith Schonberg",
title = "On-the-fly detection of access anomalies",
journal = j-SIGPLAN,
volume = "24",
number = "7",
pages = "285--297",
month = jul,
year = "1989",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:15:41 MST 2003",
bibsource = "http://www.acm.org/pubs/contents/proceedings/pldi/73141/index.html;
URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/73141/p285-schonberg/",
abstract = "Access anomalies are a common class of bugs in
shared-memory parallel programs. An access anomaly
occurs when two concurrent execution threads both write
(or one thread reads and the other writes) the same
shared memory location without coordination. Approaches
to the detection of access anomalies include static
analysis, post-mortem trace analysis, and on-the-fly
monitoring. A general on-the-fly algorithm for access
anomaly detection is presented, which can be applied to
programs with both nested fork-join and synchronization
operations. The advantage of on-the-fly detection over
post-mortem analysis is that the amount of storage used
can be greatly reduced by data compression techniques
and by discarding information as soon as it becomes
obsolete. In the algorithm presented, the amount of
storage required at any time depends only on the number
V of shared variables being monitored and the number N
of threads, not on the number of synchronizations. Data
compression is achieved by the use of two techniques
called merging and subtraction. Upper bounds on storage
are shown to be $ V \times N^2 $ for merging and $ V
\times N $ for subtraction.",
acknowledgement = ack-nhfb,
affiliationaddress = "New York, NY, USA",
annote = "Published as part of the Proceedings of PLDI'89.",
classification = "722; 723",
conference = "Proceedings of the SIGPLAN '89 Conference on
Programming Language Design and Implementation",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
journalabr = "SIGPLAN Not",
keywords = "Access Anomalies; algorithms; Computer Operating
Systems; Computer Programming Languages--Design;
Computer Systems, Digital--Parallel Processing;
languages; Parallel Programs; Program Processors",
meetingaddress = "Portland, OR, USA",
meetingdate = "Jun 21--23 1989",
meetingdate2 = "06/21--23/89",
sponsor = "ACM, Special Interest Group on Programming Languages,
New York; SS NY, USA",
subject = "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
Concurrent Programming. {\bf D.3.2} Software,
PROGRAMMING LANGUAGES, Language Classifications, Ada.
{\bf D.2.2} Software, SOFTWARE ENGINEERING, Design
Tools and Techniques, Flow charts.",
author = "D. Caswell and D. Black",
title = "Implementing a {Mach} debugger for multithreaded
crossref = "Anonymous:1990:PWU",
pages = "25--39",
year = "1990",
bibdate = "Sat Sep 28 20:03:34 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
affiliation = "Hewlett Packard Labs., Palo Alto, CA, USA",
classification = "C6150G (Diagnostic, testing, debugging and
evaluating systems); C6150J (Operating systems)",
keywords = "Application debugging; Mach debugger; Mach operating
system; Multithreaded applications; Operating system
facilities; Underlying design principles; Unix
compatible environment",
thesaurus = "Operating systems [computers]; Program debugging;
author = "Gregory Colvin",
title = "{CUG306} Thread and Synapsys",
journal = j-CUJ,
volume = "8",
type = "CUG New Release",
number = "3",
pages = "131--??",
month = mar,
year = "1990",
ISSN = "0898-9788",
bibdate = "Fri Aug 30 16:52:23 MDT 1996",
bibsource = "http://www.cuj.com/cbklist.htm;
acknowledgement = ack-nhfb,
fjournal = "C Users Journal",
author = "Gregory Colvin",
title = "Multitasking With Lightweight Threads",
journal = j-CUJ,
volume = "8",
number = "3",
pages = "55--??",
month = mar,
year = "1990",
ISSN = "0898-9788",
bibdate = "Fri Aug 30 16:52:23 MDT 1996",
bibsource = "http://www.cuj.com/cbklist.htm;
acknowledgement = ack-nhfb,
fjournal = "C Users Journal",
author = "S. J. Eggers and David R. Keppel and Eric J. Koldinger
and Henry M. Levy",
title = "Techniques for efficient inline tracing on a
shared-memory multiprocessor",
journal = j-SIGMETRICS,
volume = "18",
number = "1",
pages = "37--47",
month = may,
year = "1990",
CODEN = "????",
DOI = "https://doi.org/10.1145/98457.98501",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Thu Jun 26 11:09:08 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "While much current research concerns multiprocessor
design, few traces of parallel programs are available
for analyzing the effect of design trade-offs. Existing
trace collection methods have serious drawbacks:
trap-driven methods often slow down program execution
by more than 1000 times, significantly perturbing
program behavior; microcode modification is faster, but
the technique is neither general nor portable. This
paper describes a new tool, called MPTRACE, for
collecting traces of multithreaded parallel programs
executing on shared-memory multiprocessors. MPTRACE
requires no hardware or microcode modification; it
collects complete program traces; it is portable; and
it reduces execution-time dilation to less than a
factor 3. MPTRACE is based on inline tracing, in which
a program is automatically modified to produce trace
information as it executes. We show how the use of
compiler flow analysis techniques can reduce the amount
of data collected and therefore the runtime dilation of
the traced program. We also discuss problematic issues
concerning buffering and writing of trace data on a
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
author = "John E. Faust and Henry M. Levy",
title = "The performance of an object-oriented threads
journal = j-SIGPLAN,
volume = "25",
number = "10",
pages = "278--288",
month = oct,
year = "1990",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:15:57 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Dean W. Gonzalez",
title = "Multitasking Software Components",
journal = j-SIGADA-LETTERS,
volume = "10",
number = "1",
pages = "92--96",
month = jan # "\slash " # feb,
year = "1990",
ISSN = "1094-3641 (print), 1557-9476 (electronic)",
ISSN-L = "1094-3641",
bibdate = "Thu Sep 28 07:33:23 MDT 2000",
bibsource = "ftp://ftp.uu.net/library/bibliography;
acknowledgement = ack-nhfb,
classcodes = "C6110B (Software engineering techniques); C6120 (File
fjournal = "ACM SIGADA Ada Letters",
keywords = "Ada; Ada parameter passing semantics; concurrency,
tasking, reuse; concurrent forms; data integrity; data
structure manipulation routines; data structures;
multiple; parallel programming; reusability; semaphore
calls; software; threads of control",
treatment = "P Practical",
author = "G. J. Hansen and C. A. Linthicum and G. Brooks",
title = "Experience with a performance analyzer for
multithreaded applications",
crossref = "IEEE:1990:PSN",
pages = "124--131",
year = "1990",
bibdate = "Wed Apr 15 18:34:48 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C5470 (Performance evaluation and testing); C6150E
(General utility programs); C6150G (Diagnostic,
testing, debugging and evaluating systems)",
corpsource = "CONVEX Comput. Corp., Richardson, TX, USA",
keywords = "CONVEX C200 series computers; Convex OS V8.0; CONVEX
performance analyzer, CX/sub pa/; loops;
multiprocessing systems; multithreaded applications;
operating system facilities; parallel code monitoring;
performance evaluation; profiling data; profiling
information; time-sharing environment; time-sharing
systems; Unix; UNIX based operating system",
sponsororg = "IEEE; ACM; Lawrence Livermore Nat. Lab.; Los Alamos
Nat. Lab.; NASA Ames Res. Center; Nat. Center Atmos.
Res.; NSF; SIAM; Supercomput. Res. Center",
treatment = "P Practical; X Experimental",
author = "Stan Miastkowski",
title = "{PC GUIs} Go Head to Head",
journal = j-BYTE,
volume = "15",
number = "11",
pages = "82--87",
month = "Fall",
year = "1990",
ISSN = "0360-5280 (print), 1082-7838 (electronic)",
ISSN-L = "0360-5280",
bibdate = "Thu Sep 12 18:39:30 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C6130B (Graphics techniques); C6150J (Operating
systems); C6180 (User interfaces)",
fjournal = "BYTE Magazine",
keywords = "Graphical DOS shell; Multithreading operating system;
OS/2; PC GUIs; User interface differences; Windows
thesaurus = "Computer graphics; Operating systems [computers]; User
author = "D. J. Nordstrom",
title = "Threading {Lisp}",
journal = j-SIGPLAN,
volume = "25",
number = "2",
pages = "17--24",
month = feb,
year = "1990",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:15:50 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Constantine D. Polychronopoulos",
title = "Auto scheduling: control flow and data flow come
together",
type = "Technical Report",
number = "CSRD 1058",
institution = inst-UIUC-CSRD,
address = inst-UIUC-CSRD:adr,
pages = "28",
month = dec,
year = "1990",
bibdate = "Fri Aug 30 08:01:51 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper presents a framework we term
auto-scheduling, which brings together the control flow
and data flow models by combining most of the
advantages and excluding the major disadvantages of the
two familiar models. Auto-scheduling can be viewed
either as an abstract architectural model or as a
parallel program compilation framework. While in
ordinary environments parallel task creation and
scheduling is done by the operating system, or at best
the run-time library, in auto-scheduling task creation
and scheduling is performed by the user program itself,
making parallel processing affordable at
fine-granularity levels. Under auto-scheduling the
compiler does not only generate object code, but it
`lends' its knowledge about a program to the parallel
instruction threads of that program, allowing them to
manage, activate, and schedule themselves at run-time,
without the need of an external monitor. This is done
by means of special drive-code injected by the compiler
to each schedulable unit of a program (task, thread,
etc). We argue that auto-scheduling offers an optimal
approach for exploiting parallelism on real parallel
computer systems.",
acknowledgement = ack-nhfb,
annote = "Title on P. 1: Auto-scheduling: control flow and data
flow come together. Supported in part by the National
Science Foundation. Supported in part by the U.S.
Department of Energy. Supported in part by Digital
Equipment Corporation.",
keywords = "Parallel processing (Electronic computers); Scheduling
author = "D. L. Presotto",
booktitle = "UKUUG. UNIX - The Legend Evolves. Proceedings of the
Summer 1990 UKUUG Conference",
title = "Multiprocessor Streams for {Plan 9}",
publisher = pub-UKUUG,
address = pub-UKUUG:adr,
pages = "11--19 (of xi + 260)",
month = "????",
year = "1990",
ISBN = "0-9513181-7-9",
ISBN-13 = "978-0-9513181-7-1",
LCCN = "????",
bibdate = "Sat Mar 22 15:10:17 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classcodes = "C6150J (Operating systems)",
conflocation = "London, UK; 9-13 July 1990",
corpsource = "AT&T Bell Lab., Murray Hill, NJ, USA",
keywords = "abstraction; input-output programs; kernel;
multi-threaded; multiprocessing programs;
multiprocessor; Plan 9 kernel; Streams; system call
interface; Unix",
treatment = "P Practical",
author = "Rafael H. Saavedra-Barrera and David E. Culler and
Thorsten von Eicken",
title = "Analysis of multithreaded architectures for parallel
type = "Report",
number = "UCB/CSD 90/569",
institution = "University of California, Berkeley, Computer Science
address = "Berkeley, CA, USA",
pages = "10",
month = apr,
year = "1990",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "To appear in the 2nd Annual ACM Symposium on Parallel
Algorithms and Architectures, Crete, Greece, July
abstract = "Multithreading has been proposed as an architectural
strategy for tolerating latency in multiprocessors and,
through limited empirical studies, shown to offer
promise. This paper develops an analytical model of
multithreaded processor behavior based on a small set
of architectural and program parameters. The model
gives rise to a large Markov chain, which is solved to
obtain a formula for processor efficiency in terms of
the number of threads per processor, the remote
reference rate, the latency, and the cost of switching
between threads. It is shown that a multithreaded
processor exhibits three operating regimes: linear
(efficiency is proportional to the number of threads),
transition, and saturation (efficiency depends only on
the remote reference rate and switch cost). Formulae
for regime boundaries are derived. The model is
embellished to reflect cache degradation due to
multithreading, using an analytical model of cache
behavior, demonstrating that returns diminish as the
number of threads becomes large. Predictions from the
embellished model correlate well with published
empirical measurements. Prescriptive use of the model
under various scenarios indicates that multithreading
is effective, but the number of useful threads per
processor is fairly small.",
acknowledgement = ack-nhfb,
annote = "Supported in part by NASA. Supported in part by the
National Science Foundation through the UCB Mammoth
keywords = "Computer architecture; Multiprocessors",
author = "David A. Schmitt",
title = "{C} Extensions For Multi-Threading",
journal = j-CUJ,
volume = "8",
number = "8",
pages = "33--??",
month = aug,
year = "1990",
ISSN = "0898-9788",
bibdate = "Fri Aug 30 16:52:23 MDT 1996",
bibsource = "http://www.cuj.com/cbklist.htm;
acknowledgement = ack-nhfb,
fjournal = "C Users Journal",
author = "Joseph Francis Stapleton",
title = "Dynamic server selection in a multithreaded network
computing environment",
type = "Thesis ({M.S.})",
school = "Iowa State University",
address = "Ames, IA, USA",
pages = "66",
year = "1990",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Anant Agarwal",
title = "Performance tradeoffs in multithreaded processors",
type = "Technical report",
number = "MIT/LCS/TR 501; VLSI memo no. 89-566",
institution = "Laboratory for Computer Science, Massachusetts
Institute of Technology",
address = "Cambridge, MA, USA",
pages = "39",
year = "1991",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "R. Balter and J. Bernadat and D. Decouchant and A.
Duda and A. Freyssinet and S. Krakowiak and M.
Meysembourg and P. Le Dot and H. Nguyen Van and E.
Paire and M. Riveill and C. Roison and X. Rousset de
Pina and R. Scioville and G. Vand{\^o}me",
title = "Architecture and Implementation of Guide, an
Object-Oriented Distributed System",
journal = j-COMP-SYS,
volume = "4",
number = "1",
pages = "31--67",
month = "Winter",
year = "1991",
ISSN = "0895-6340",
bibdate = "Fri Sep 13 08:51:08 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
classification = "C6110 (Systems analysis and programming); C6150J
(Operating systems)",
fjournal = "Computing Systems",
keywords = "Class; Distributed object memory; Dynamic links;
Execution structures; Execution units; Grenoble
Universities integrated distributed environment; Guide;
Job sharing; Language; Multi-threaded virtual machines;
Nodes; Object model; Object-oriented distributed
operating system; Persistent objects storage; Single
inheritance; Synchronized objects; Synchronized
transactions; Type; UNIX",
thesaurus = "Distributed processing; Object-oriented programming;
Operating systems [computers]",
author = "A. J. M. Beddow",
title = "Multi-Threaded {C} Functions",
journal = j-CUJ,
volume = "9",
number = "1",
pages = "57--??",
month = jan,
year = "1991",
ISSN = "0898-9788",
bibdate = "Fri Aug 30 16:52:23 MDT 1996",
bibsource = "http://www.cuj.com/cbklist.htm;
acknowledgement = ack-nhfb,
fjournal = "C Users Journal",
author = "D. Bolinger and S. Mangalat",
title = "Parallelizing signal handling and process management
in {OSF/1}",
crossref = "USENIX:1991:PUM",
pages = "105--122",
year = "1991",
bibdate = "Sat Sep 28 19:47:51 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib;
acknowledgement = ack-nhfb,
affiliation = "Encore Computer Corp., Marlborough, MA, USA",
classification = "C6110P (Parallel programming); C6150J (Operating
keywords = "Mach kernel; Multi-threaded programming model;
Multi-threaded tasks; Multiprocessor-efficient; OSF/1
operating system; Parallelization; Performance
improvements; Process management; Races; Signal
handling; Synchronization problems; System calls; Unix
emulation; Unix process-oriented abstractions",
thesaurus = "Interrupts; Operating systems [computers]; Parallel
programming; Unix",
author = "R. Canetti and L. P. Fertig and S. A. Kravitz and D.
Malki and R. Y. Pinter and S. Porat and A. Teperman",
title = "The parallel {C} ({pC}) programming language",
journal = j-IBM-JRD,
volume = "35",
number = "5/6",
pages = "727--741",
month = sep # "\slash " # nov,
year = "1991",
ISSN = "0018-8646 (print), 2151-8556 (electronic)",
ISSN-L = "0018-8646",
bibdate = "Tue Mar 25 14:26:59 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The authors describe pC (parallel C), an extension of
the ANSI C programming language to support medium- to
large-grain parallel programming in both shared- and
distributed-memory environments. pC aims to make
programming for parallel processors accessible to the C
community by enriching the C programming model with a
small set of constructs supporting parallelism. pC
supports shared- and distributed-memory environments
via a hierarchical computational model. A pC
application comprises a static collection of tasks with
disjoint memory spaces. A dynamic collection of threads
runs within each task, sharing the data and code of the
task. Language constructs specify concurrent execution
of threads within a single task. Additional language
constructs specify the interactions between threads
through the following mechanisms: initiation of threads
in remote tasks by remote function call, mailbox-based
message passing, and synchronization primitives. The
paper introduces the computational model and language
constructs of pC and describes a prototype pC compiler
and run-time system for the Mach operating system.
Several program examples illustrate the utility of pC
acknowledgement = ack-nhfb,
affiliation = "Dept. of Comput. Sci., Technion-Israel Inst. of
Technol., Haifa, Israel",
classcodes = "C6140D (High level languages); C6110P (Parallel
programming); C6150C (Compilers, interpreters and other
classification = "C6110P (Parallel programming); C6140D (High level
languages); C6150C (Compilers, interpreters and other
corpsource = "Dept. of Comput. Sci., Technion-Israel Inst. of
Technol., Haifa, Israel",
fjournal = "IBM Journal of Research and Development",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
keywords = "ANSI C programming language; C language; C
programming; C programming model; Disjoint memory
spaces; disjoint memory spaces; Distributed-memory;
distributed-memory; function call; Hierarchical
computational model; hierarchical computational model;
Language constructs; language constructs; Mach; Mach
operating system; Mailbox-based message passing;
mailbox-based message passing; model; operating system;
Parallel C; parallel C; parallel languages; Parallel
programming; parallel programming; Parallelism;
parallelism; PC; pC; PC compiler; pC compiler; program
compilers; remote; Remote function call; Run-time
system; run-time system; Shared memory; shared memory;
Synchronization; synchronization; Tasks; tasks;
Threads; threads",
thesaurus = "C language; Parallel languages; Program compilers",
treatment = "P Practical",
author = "W.-M. Ching and D. Ju",
title = "Execution of automatically parallelized {APL} programs
on {RP3}",
journal = j-IBM-JRD,
volume = "35",
number = "5/6",
pages = "767--777",
month = sep # "\slash " # nov,
year = "1991",
ISSN = "0018-8646 (print), 2151-8556 (electronic)",
ISSN-L = "0018-8646",
bibdate = "Tue Mar 25 14:26:59 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The authors have implemented an experimental APL/C
compiler, which accepts ordinary APL programs and
produces C programs. They have also implemented a
run-time environment that supports the parallel
execution of these C programs on the RP3 computer, a
shared-memory, 64-way MIMD machine built at the IBM
Thomas J. Watson Research Center. The APL/C compiler
uses the front end of the APL/370 compiler and imposes
the same restrictions, but requires no parallelization
directives from the user. The run-time environment is
based on simple synchronization primitives and is
implemented using Mach threads. They report the
speedups of several compiled programs running on RP3
under the Mach operating system. The current
implementation exploits only data parallelism. They
discuss the relationship between the style of an APL
program and its expected benefit from the automatic
parallel execution provided by the compiler.",
acknowledgement = ack-nhfb,
affiliation = "IBM Thomas J. Watson Res. Center, Yorktown Heights,
classcodes = "C6150C (Compilers, interpreters and other processors);
C6150N (Distributed systems); C6140D (High level
classification = "C6140D (High level languages); C6150C (Compilers,
interpreters and other processors); C6150N (Distributed
corpsource = "IBM Thomas J. Watson Res. Center, Yorktown Heights,
fjournal = "IBM Journal of Research and Development",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
keywords = "APL; APL/370 compiler; APL/C; APL/C compiler;
Automatically parallelized APL programs; automatically
parallelized APL programs; C language; C programs;
compiler; compilers; Data parallelism; data
parallelism; Mach operating; Mach operating system;
Mach threads; multiprocessing programs; program; RP3;
Shared-memory; shared-memory; synchronisation;
Synchronization primitives; synchronization primitives;
thesaurus = "APL; C language; Multiprocessing programs; Program
compilers; Synchronisation",
treatment = "P Practical",
author = "Tzi-cker Chiueh",
title = "Multi-threaded vectorization",
journal = j-COMP-ARCH-NEWS,
volume = "19",
number = "3",
pages = "352--361",
month = may,
year = "1991",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:01 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "David E. Culler and Anurag Sah and Klaus E. Schauser
and Thorsten von Eicken and John Wawrzynek",
title = "Fine-grain parallelism with minimal hardware support:
a compiler-controlled threaded abstract machine",
journal = j-COMP-ARCH-NEWS,
volume = "19",
number = "2",
pages = "164--175",
month = apr,
year = "1991",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:40 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "David E. Culler and Anurag Sah and Klaus E. Schauser
and Thorsten von Eicken and John Wawrzynek",
title = "Fine-Grain Parallelism with Minimal Hardware Support:
a Compiler-Controlled Threaded Abstract Machine",
journal = j-SIGPLAN,
volume = "26",
number = "4",
pages = "164--175",
month = apr,
year = "1991",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sat May 01 18:50:04 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "David E. Culler and Anurag Sah and Klaus E. Schauser
and Thorsten von Eicken and John Wawrzynek",
title = "Fine-grain parallelism with minimal hardware support:
a compiler-controlled threaded abstract machine",
journal = j-OPER-SYS-REV,
volume = "25",
number = "3S",
pages = "164--175",
month = apr,
year = "1991",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 15:24:15 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Richard P. Draves and Brian N. Bershad and Richard F.
Rashid and Randall W. Dean",
title = "Using continuations to implement thread management and
communication in operating systems",
journal = j-OPER-SYS-REV,
volume = "25",
number = "5",
pages = "122--136",
month = oct,
year = "1991",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:57 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Roger Faulkner and Ron Gomes",
title = "The Process File System and Process Model in {UNIX
System V}",
crossref = "USENIX:1991:PWU",
pages = "243--252",
year = "1991",
bibdate = "Mon Jan 02 08:29:13 2017",
bibsource = "ftp://ftp.uu.net/library/bibliography;
URL = "http://obits.mlive.com/obituaries/grandrapids/obituary.aspx?pid=180588279;
abstract = "We describe the process file system {\bf /proc} in
UNIX System V Release 4 and its relationship to the
UNIX process model abstraction. {\bf /proc} began as a
debugger interface superseding {\em ptrace(2)\/} but
has evolved into a general interface to the process
model. It provides detailed process information and
control mechanisms that are independent of operating
system implementation details and portable to a large
class of real architectures. Control is thorough.
Processes can be stopped and started on demand and can
be instructed to stop on events of interest: specific
machine faults, specific signals, and entry to or exit
from specific system calls. Complete encapsulation of a
process's execution environment is possible, as well as
non-intrusive inspection. Breakpoint debugging is
relieved from the ambiguities of signals. Security
provisions are complete and non-destructive.\par
The addition of multi-threading to the process model
motivates a proposal for a substantial change to the
{\bf /proc} interface that would replace the
single-level flat structure with a hierarchy of
directories containing status and control files. This
restructuring would eliminate all {\em ioctl(2)\/}
operations in favor of {\em read(2)\/} and {\em
write(2)\/} operations, which generalize more easily to
networks.",
acknowledgement = ack-nhfb,
author-dates = "Roger Faulkner (8 April 1940--2 July 2016)",
author = "Bill O. Gallmeister and Chris Lanier",
title = "Early experience with {POSIX 1003.4} and {POSIX
1003.4 A}",
pages = "190--198 (of ix + 307)",
year = "1991",
ISBN = "0-8186-2450-7",
ISBN-13 = "978-0-8186-2450-6",
LCCN = "QA 76.54 R43 1991",
bibdate = "Mon Dec 22 09:06:02 1997",
bibsource = "Compendex database;
note = "IEEE catalog number 91CH3090-8.",
abstract = "Two proposed IEEE standards for real-time operating
systems support, POSIX.4 and POSIX.4a, are proceeding
towards IEEE approval and will eventually become
international standards. The authors provide a brief
overview of the facilities of POSIX.4 and POSIX.4a.
They concentrate on a few of the critical features that
POSIX.4 and POSIX.4a provide and describe the POSIX.4
scheduling interface. The POSIX.4a support for multiple
threads of control is also described. The features
found in POSIX.4 and POSIX.4a for synchronization of
multiple threads, are discussed, and the POSIX.4
interprocess communication facility is presented. The
performance numbers are given to allow comparisons of
the facilities of traditional UNIX systems, the
facilities of a representative hard real-time system
(LynxOS), and the facilities of POSIX.4 and POSIX.4a.",
acknowledgement = ack-nhfb,
classification = "722; 723; 902",
conference = "Proceedings of the 12th Real-Time Systems Symposium",
conferenceyear = "1991",
fjournal = "Proceedings --- Real-Time Systems Symposium",
journalabr = "Proc Real Time Syst Symp",
keywords = "Computer Operating Systems--Standards; Computer
Systems, Digital; POSIX.4a Standards; Real Time
Operation; Real-Time Operating Systems",
meetingaddress = "San Antonio, TX, USA",
meetingdate = "Dec 4--6 1991",
meetingdate2 = "12/04--06/91",
publisherinfo = "IEEE Service Center",
sponsor = "IEEE Computer Soc",
author = "Ray R. Glenn",
title = "Characterizing memory hot spots in a shared memory
{MIMD} machine",
type = "Technical report",
number = "SRC-TR-91-039",
institution = inst-SRC-IDA,
address = inst-SRC-IDA:adr,
pages = "24",
day = "15",
month = oct,
year = "1991",
bibdate = "Fri Aug 30 08:01:51 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper analyzes two memory hot spot problems
associated with massively parallel MIMD computers. The
first is the memory stride problem, which is similar to
stride problems found in existing supercomputers. The
second hot spot problem occurs in designs that use two
separate memory accesses to lock and unlock critical
sections (split transaction) and employ a first
come/first serve queuing mechanism for shared memory
locations. A bistability in throughput brought about by
these conditions is analyzed and experimentally
demonstrated. Simple equations are presented which
predict the throughput at a critical section of code as
a function of the number of applied threads. In
particular, the mean size of the work items that can be
executed in parallel without the possibility of
stalling is proportional to the square of the number of
threads applied.",
acknowledgement = ack-nhfb,
keywords = "Multiprocessors",
author = "H. Hirata and Y. Mochizuki and A. Nishimura and Y.
title = "A Multithreaded Processor Architecture with
Simultaneous Instruction Issuing",
crossref = "Anonymous:1991:PIS",
pages = "87--96",
year = "1991",
bibdate = "Mon Aug 26 10:38:41 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "T. Hironaka and T. Hashimoto and K. Okazaki and K.
title = "A Single-Chip Vector-Processor Prototype Based on
Multithreaded Streaming\slash {FIFO} ({MSFV})
crossref = "Anonymous:1991:PIS",
pages = "77--86",
year = "1991",
bibdate = "Mon Aug 26 10:38:41 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Susumu Horiguchi and Takeo Nakada",
title = "Performance Evaluation of Parallel Fast {Fourier}
Transform on a Multiprocessor Workstation",
journal = j-J-PAR-DIST-COMP,
volume = "12",
number = "2",
pages = "158--163",
month = jun,
year = "1991",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Sat Apr 12 17:13:17 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C4190 (Other numerical methods); C4240 (Programming
and algorithm theory); C5440 (Multiprocessor systems
and techniques)",
corpsource = "Dept. of Inf. Sci., Tohoku Univ., Sendai, Japan",
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "algorithms; cache protocols; fast Fourier transform;
fast Fourier transforms; FFT; floating-; multiprocess
operating system; multiprocessing systems;
multiprocessor workstation; multithread operating
system; operating systems; parallel; parallel FFT;
performance; performance evaluation; point
treatment = "P Practical",
author = "H. H. J. Hum and G. R. Gao",
title = "A Novel High-Speed Memory Organization for Fine-Grain
Multi-Thread Computing",
journal = j-LECT-NOTES-COMP-SCI,
volume = "505",
pages = "34--??",
year = "1991",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon May 13 08:51:55 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "W. F. Jolitz and L. G. Jolitz",
title = "Porting {UNIX} to the 386. {The} basic kernel
Multiprogramming and multitasking. {II}",
journal = j-DDJ,
volume = "16",
number = "10",
pages = "62, 64, 66, 68, 70, 72, 118--120",
month = oct,
year = "1991",
ISSN = "1044-789X",
bibdate = "Tue Sep 10 09:11:02 MDT 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
acknowledgement = ack-nhfb,
classification = "C6110 (Systems analysis and programming); C6150J
(Operating systems)",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "386BSD kernel; Multiple simultaneous process
execution; Multiprogramming; Multitasking; Multithread
operations; Operating systems; Porting; Sleep( ); Swch(
); Switching mechanisms; UNIX; Wakeup( )",
thesaurus = "C listings; Microprocessor chips; Multiprogramming;
Software portability; Unix",
author = "Michael B. Jones",
title = "Bringing the {C} Libraries with Us into a
Multi-Threaded Future",
crossref = "USENIX:1991:PWU",
pages = "81--92",
day = "21--25",
month = jan,
year = "1991",
bibdate = "Fri Oct 18 07:24:24 MDT 1996",
bibsource = "ftp://ftp.uu.net/library/bibliography;
acknowledgement = ack-nhfb,
affiliation = "Carnegie Mellon University",
author = "Wolfgang K{\"u}chlin",
title = "On the multi-threaded computation of integral
polynomial greatest common divisors",
crossref = "Watt:1991:IPI",
pages = "333--342",
year = "1991",
bibdate = "Thu Mar 12 08:38:03 MST 1998",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/issac/120694/p333-kuchlin/",
abstract = "Reports experiences and practical results from
parallelizing the Brown--Collins polynomial g.c.d.
algorithm, starting from Collins' SAC-2 implementation
IPGCDC. The parallelization environment is PARSAC-2, a
multi-threaded version of SAC-2 programmed in C with
the parallelization constructs of the C Threads
library. IPGCDC computes the g.c.d. and its co-factors
of two polynomials in $ Z(x_1, \ldots {}, x_r) $, by
first reducing the problem to multiple calculations of
modular polynomial g.c.d.'s in $ Z_p(x_1, \ldots {},
x_r) $, and then recovering the result by Chinese
remaindering. After studying timings of the SAC-2
algorithm, the author first parallelizes the Chinese
remainder algorithm, and then parallelizes the main
loop of IPGCDC by executing the modular g.c.d.
computations concurrently. Finally, he determines
speed-ups and speed-up efficiencies of the parallel
algorithms over a wide range of polynomials. The
experiments were conducted on a 12 processor Encore
Multimax under Mach.",
acknowledgement = ack-nhfb,
affiliation = "Dept. of Comput. and Inf. Sci., Ohio State Univ.,
Columbus, OH, USA",
classification = "C4240 (Programming and algorithm theory); C7310
keywords = "algorithms; Brown--Collins polynomial g.c.d.
algorithm; Chinese remaindering; Encore Multimax;
Multi-threaded computation; PARSAC-2; Polynomial
greatest common divisors",
subject = "{\bf G.1.0} Mathematics of Computing, NUMERICAL
ANALYSIS, General, Parallel algorithms. {\bf F.2.1}
PROBLEM COMPLEXITY, Numerical Algorithms and Problems,
Computations on polynomials. {\bf I.1.0} Computing
General. {\bf I.1.3} Computing Methodologies, SYMBOLIC
AND ALGEBRAIC MANIPULATION, Languages and Systems. {\bf
D.3.2} Software, PROGRAMMING LANGUAGES, Language
Classifications, C.",
thesaurus = "Mathematics computing; Parallel algorithms; Symbol
author = "G. Malan and R. Rashid and D. Golub and R. Baron",
title = "{DOS} as a {Mach 3.0} application",
crossref = "USENIX:1991:PUM",
pages = "27--40",
year = "1991",
bibdate = "Sat Sep 28 19:47:51 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib;
acknowledgement = ack-nhfb,
affiliation = "Sch. of Comput. Sci., Carnegie Mellon Univ.,
Pittsburgh, PA, USA",
classification = "C6150J (Operating systems); C7430 (Computer
keywords = "Common DOS functions; Common DOS software; DOS
functionality; DOS operating system; Frequently loaded
DOS drivers; High memory area; High-speed space combat
simulation system; I/O devices; I386/i486 architecture;
Latency demands; Mach features; Machine-dependent
kernel modifications; Multiple virtual DOS
environments; Multithreaded emulation; PC architecture;
Performance sensitive PC entertainment software;
Timing; Unix emulation; Unix Server; VGA display;
Virtual 8086 mode; Virtual machine environment; Wing
thesaurus = "IBM computers; Microcomputer applications; Supervisory
programs; Unix; Virtual machines",
author = "Richard F. Man",
title = "A Multithreading Library In {C} For Subsumption
journal = j-CUJ,
volume = "9",
number = "11",
pages = "42--??",
month = nov,
year = "1991",
ISSN = "0898-9788",
bibdate = "Fri Aug 30 16:52:23 MDT 1996",
bibsource = "http://www.cuj.com/cbklist.htm;
acknowledgement = ack-nhfb,
fjournal = "C Users Journal",
author = "Brian D. Marsh and Michael L. Scott and Thomas J.
LeBlanc and Evangelos P. Markatos",
title = "First-class user-level threads",
journal = j-OPER-SYS-REV,
volume = "25",
number = "5",
pages = "110--121",
month = oct,
year = "1991",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:57 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Lawrence Mennemeier",
title = "Hardware mechanisms to support concurrent threads on
{RISC} and superscalar multiprocessors",
type = "Thesis ({M.S.})",
school = "University of California, Santa Cruz",
pages = "vii + 39",
year = "1991",
LCCN = "QA76.5.M44 1991",
bibdate = "Fri May 10 12:18:17 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Masters theses -- University of California, Santa Cruz
-- 1991; multiprocessors; parallel processing
(electronic computers)",
author = "Gregory M. Papadopoulos and Kenneth R. Traub",
title = "Multithreading: a revisionist view of dataflow
architectures",
journal = j-COMP-ARCH-NEWS,
volume = "19",
number = "3",
pages = "342--351",
month = may,
year = "1991",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:01 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Won Woo Park",
title = "Performance-area trade-offs in multithreaded
processing unit",
type = "Thesis ({Ph.D.})",
school = "University of Texas at Austin",
address = "Austin, TX, USA",
pages = "xvii + 165",
year = "1991",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Computer architecture; Multiprocessors; Parallel
processing (Electronic computers)",
author = "Thuan Quang Pham",
title = "The experimental migration of a distributed
application to a multithreaded environment",
type = "Thesis ({M.S.})",
school = "Massachusetts Institute of Technology, Department of
Electrical Engineering and Computer Science",
address = "Cambridge, MA, USA",
pages = "51",
year = "1991",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "M. Krish Ponamgi and Wenwey Hseush and Gail E.
Kaiser",
title = "Debugging Multithreaded Programs with {MPD}",
journal = j-IEEE-SOFTWARE,
volume = "8",
number = "3",
pages = "37--43",
month = may,
year = "1991",
ISSN = "0740-7459 (print), 1937-4194 (electronic)",
ISSN-L = "0740-7459",
bibdate = "Sat Jan 25 07:35:26 MST 1997",
bibsource = "Compendex database;
Misc/IMMD_IV.bib; Parallel/debug_3.1.bib",
acknowledgement = ack-nhfb,
affiliation = "Dept of Comput Sci, Columbia Univ, New York, NY, USA",
classification = "723",
fjournal = "IEEE Software",
journal-URL = "http://www.computer.org/portal/web/csdl/magazines/software",
journalabr = "IEEE Software",
keywords = "Computer Programming; Computer Systems, Digital ---
Multiprocessing; Event Recognition; Multiprocessor
Debugger; Multithreaded Software; Pattern Recognition;
Program Debugging",
author = "M. L. Powell and S. R. Kleiman and S. Barton and D.
Shah and D. Stein and M. Weeks",
title = "{SunOS} Multi-thread Architecture",
crossref = "USENIX:1991:PWU",
institution = "Sun Microsystems, Inc.",
pages = "65--80",
day = "21--25",
month = jan,
year = "1991",
bibdate = "Wed Aug 13 10:48:45 MDT 1997",
bibsource = "ftp://ftp.uu.net/library/bibliography;
acknowledgement = ack-nhfb,
affiliation = "Sun Microsystems, Inc.",
author = "Scott Richman",
title = "Examining the {Hamilton C} shell ({Unix} power for
journal = j-DDJ,
volume = "16",
number = "1",
pages = "98, 100, 102, 104--106",
month = jan,
year = "1991",
ISSN = "1044-789X",
bibdate = "Tue Sep 10 09:11:02 MDT 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
UnCover database",
abstract = "Doug Hamilton's C Shell helps you create more powerful
OS/2 programs.",
acknowledgement = ack-nhfb,
classification = "C6115 (Programming support); C6150E (General utility
programs); C6150J (Operating systems)",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "C shell environment; C++ programs; High-performance
file system; Large command lines; Long filenames; OS/2
features; Pipes; Presentation Manager; Script language;
Script program; Shell scripts; Text windows; Threads;
thesaurus = "C listings; Software packages; Software tools; Utility
author = "Rafael H. Saavedra-Barrera and David E. Culler",
title = "An analytical solution for a {Markov} chain modeling
multithreaded execution",
type = "Report",
number = "UCB/CSD 91/623",
institution = "University of California, Berkeley, Computer Science
Division",
address = "Berkeley, CA, USA",
pages = "24",
month = apr,
year = "1991",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multithreading is an architectural technique aimed at
maintaining high processor utilization in the presence
of large memory or interprocessor communication
latency. While waiting for a remote reference to
complete, the processor switches to another execution
thread. Several realizations of this concept have been
proposed, but little data is available on the actual
costs and benefits. This paper presents an analytical
model of multithreaded execution, which may serve to
guide and explain empirical studies. The model is based
on three key parameters: thread run-length, switch
cost, and latency. A closed-form expression for
processor utilization is obtained for deterministic and
stochastic run-lengths. The derivation involves
identifying specific patterns in the very large set of
equations forming the Markov chain. Using this result,
three operating regimes are identified for a
multithreaded processor subject to long latencies:
linear, where utilization is proportional to the number
of threads per processor, saturation, where utilization
is determined only by the run-length and switch cost,
and transition between the other regimes. The model can
be used to estimate the effects of several
architectural variations.",
acknowledgement = ack-nhfb,
annote = "Supported in part by NASA under consortium agreement
NCA2-128 and cooperative agreement NCC2-550. Supported
in part by the National Science Foundation.",
keywords = "Computer architecture; Markov chains",
author = "Klaus Erik Schauser and David E. Culler and Thorsten
{von Eicken}",
title = "Compiler-Controlled Multithreading for Lenient
Parallel Languages",
journal = j-LECT-NOTES-COMP-SCI,
volume = "523",
pages = "50--??",
year = "1991",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon May 13 08:51:55 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Klaus Erik Schauser",
title = "Compiling dataflow into threads: efficient
compiler-controlled multithreading for lenient parallel
languages",
type = "Thesis ({M.S.})",
school = "University of California, Berkeley, Computer Science
Division",
address = "Berkeley, CA, USA",
pages = "71",
day = "2",
month = jul,
year = "1991",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Also available as Report UCB/CSD 91/644",
abstract = "Powerful non-strict parallel languages require fast
dynamic scheduling. This thesis explores how the need
for multithreaded execution can be addressed as a
compilation problem, to achieve switching rates
approaching what hardware mechanisms might provide.
Compiler-controlled multithreading is examined through
compilation of a lenient parallel language, ID90, for a
threaded abstract machine, TAM. A key feature of TAM is
that synchronization is explicit and occurs only at the
start of a thread, so that a simple cost model can be
applied. A scheduling hierarchy allows the compiler to
schedule logically related threads closely together in
time and to use registers across threads. Remote
communication is via message sends and split-phase
memory accesses. Messages and memory replies are
received by compiler-generated message handlers which
rapidly integrate these events with thread scheduling.
To compile ID90 for TAM, we employ a new parallel
intermediate form, dual-graphs, with distinct control
and data arcs. This provides a clean framework for
partitioning the program into threads, scheduling
threads, and managing registers under asynchronous
execution. The compilation process is described and
preliminary measurements of the effectiveness of the
approach are discussed. Previous to this work,
execution of Id90 programs was limited to specialized
architectures or dataflow graph interpreters. By
compiling via TAM, we have achieved more than two
orders of magnitude performance improvement over graph
interpreters on conventional machines, making this Id90
implementation competitive with machines supporting
dynamic instruction scheduling in hardware. Timing
measurements show that our Id90 implementation on a
standard RISC can achieve a performance close to Id90
on one processor of the recent dataflow machine
Monsoon. It can be seen that the TAM partitioning
presented in this thesis reduces the control overhead
substantially and that more aggressive partitioning
would yield modest additional benefit. There is,
however, considerable room for improvement in
scheduling and register management.",
acknowledgement = ack-nhfb,
annote = "Supported in part by the National Science Foundation.
Supported in part by Motorola Inc., the TRW Foundation,
and the International Computer Science Institute",
keywords = "Compilers (Computer programs); Parallel programming
(Computer science)",
author = "Klaus Erik Schauser and David E. Culler and Thorsten
{von Eicken}",
title = "Compiler-controlled multithreading for lenient
parallel languages",
type = "Report",
number = "UCB/CSD 91/640",
institution = "University of California, Berkeley, Computer Science
Division",
address = "Berkeley, CA, USA",
pages = "21",
day = "30",
month = jul,
year = "1991",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "A version of this report is to appear in the
Proceedings of FPCA '91 Conference on Functional
Programming Languages and Computer Architecture, Aug.
1991, Springer-Verlag",
abstract = "Tolerance to communication latency and inexpensive
synchronization are critical for general-purpose
computing on large multiprocessors. Fast dynamic
scheduling is required for powerful nonstrict parallel
languages. However, machines that support rapid
switching between multiple execution threads remain a
design challenge. This paper explores how multithreaded
execution can be addressed as a compilation problem, to
achieve switching rates approaching what hardware
mechanisms might provide. Compiler-controlled
multithreading is examined through compilation of a
lenient parallel language, Id90, for a threaded
abstract machine, TAM. A key feature of TAM is that
synchronization is explicit and occurs only at the
start of a thread, so that a simple cost model can be
applied. A scheduling hierarchy allows the compiler to
schedule logically related threads closely together in
time and to use registers across threads. Remote
communication is via message sends and split-phase
memory accesses. Messages and memory replies are
received [sic] by compiler-generated message handlers
which rapidly integrate these events with thread
scheduling. To compile Id90 for TAM, we employ a new
parallel intermediate form, dual-graphs, with distinct
control and data arcs. This provides a clean framework
for partitioning the program into threads, scheduling
threads, and managing registers under asynchronous
execution. The compilation process is described and
preliminary measurements of its effectiveness are
discussed. Dynamic execution measurements are obtained
via a second compilation step, which translates TAM
into native code for existing machines with
instrumentation incorporated. These measurements show
that the cost of compiler-controlled multithreading is
within a small factor of the cost of control flow in
sequential languages.",
acknowledgement = ack-nhfb,
annote = "Supported in part by the National Science Foundation
PYI Award. Supported in part by Motorola Inc., the TRW
Foundation and the Semiconductor Research Corporation
Supported in part by J. Wawrzynek's PYI Award.
Supported in part by NSF Infrastructure Grant.",
keywords = "Compilers (Computer programs); Parallel programming
(Computer science)",
author = "Karsten Schwan and Hongyi Zhou and Ahmed Gheith",
title = "Real-time threads",
journal = j-OPER-SYS-REV,
volume = "25",
number = "4",
pages = "35--46",
month = oct,
year = "1991",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:51 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Thomas G. Speer and Mark W. Storm",
title = "{Digital}'s Transaction Processing Monitors",
journal = j-DEC-TECH-J,
volume = "3",
number = "1",
pages = "18--32",
month = "Winter",
year = "1991",
ISSN = "0898-901X",
bibdate = "Thu Mar 20 18:15:43 MST 1997",
bibsource = "/usr/local/src/bib/bibliography/Database/Graefe.bib;
URL = "ftp://ftp.digital.com/pub/Digital/info/DTJ/v3n1/Digitals_Transaction_Processi_01oct1991DTJ102P8.ps;
abstract = "Digital provides two transaction processing (TP)
monitor products --- ACMS (Application Control and
Management System) and DECintact (Integrated
Application Control). Each monitor is a unified set of
transaction processing services for the application
environment. These services are layered on the VMS
operating system. Although there is a large functional
overlap between the two, both products achieve similar
goals by means of some significantly different
implementation strategies. Flow control and
multithreading in the ACMS monitor is managed by means
of a fourth-generation language (4GL) task definition
language. Flow control and multithreading in the
DECintact monitor is managed at the application level
by third-generation language (3GL) calls to a library
of services. The ACMS monitor supports a deferred task
model of queuing, and the DECintact monitor supports a
message-based model. Over time, the persistent
distinguishing feature between the two monitors will be
their different application programming interfaces.",
acknowledgement = ack-nhfb,
affiliation = "Digital Equipment Corp., Maynard, MA, USA",
classcodes = "C6150J (Operating systems)",
classification = "C6150J (Operating systems)",
corpsource = "Digital Equipment Corp., Maynard, MA, USA",
fjournal = "Digital Technical Journal",
keywords = "ACMS; Application; Application Control; Application
Control and Management System; Application programming
interfaces; application programming interfaces; Control
and Management System; DECintact; Digital; Integrated;
Integrated Application Control; message-based model;
Message-based model; monitors; Monitors;
Multithreading; multithreading; Queuing; queuing;
supervisory programs; task definition language; Task
definition language; transaction processing;
Transaction processing; transaction processing; VMS
operating system",
thesaurus = "Supervisory programs; Transaction processing",
treatment = "P Practical",
author = "Kenneth R. Traub",
title = "Multi-thread Code Generation for Dataflow
Architectures from Non-Strict Programs",
journal = j-LECT-NOTES-COMP-SCI,
volume = "523",
pages = "73--??",
year = "1991",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon May 13 08:51:55 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Anant Agarwal",
title = "Performance tradeoffs in multithreaded processors",
volume = "3",
number = "5",
pages = "525--539",
month = sep,
year = "1992",
DOI = "https://doi.org/10.1109/71.159037",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Fri Apr 11 15:20:39 MDT 1997",
bibsource = "Compendex database;
acknowledgement = ack-nhfb,
affiliation = "Lab for Comput Sci, MIT, Cambridge, MA, USA",
classification = "722.1; 722.4; C4230M (Multiprocessor
interconnection); C4240P (Parallel programming and
algorithm theory); C5220P (Parallel architecture);
C5320G (Semiconductor storage); C5440 (Multiprocessor
systems and techniques); C5470 (Performance evaluation
and testing); C6120 (File organisation)",
corpsource = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
fjournal = "IEEE Transactions on Parallel and Distributed
Systems",
journal-URL = "http://www.computer.org/tpds/archives.htm",
journalabr = "IEEE Trans Parallel Distrib Syst",
keywords = "buffer storage; cache interference; Cache memories;
caches; contention; context-switching overhead;
data-sharing; Digital storage; interconnection
networks; Interconnection networks; multiprocessing
systems; multiprocessor; multithreaded processors;
network; network bandwidth; parallel; parallel
algorithms; Parallel processing systems; Performance;
Performance analysis; performance evaluation; Pipeline
processing systems; programming; storage management;
switching theory",
treatment = "P Practical; T Theoretical or Mathematical",
author = "G. A. Alverson and R. Alverson and D. Callahan and B.
title = "Exploiting Heterogeneous Parallelism on a
Multi-threaded Multiprocessor",
crossref = "ACM:1992:CPI",
pages = "188--197",
year = "1992",
bibdate = "Mon Aug 26 10:38:41 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Thomas E. Anderson and Brian N. Bershad and Edward D.
Lazowska and Henry M. Levy",
title = "Scheduler Activations: Effective Kernel Support for
the User-Level Management of Parallelism",
journal = j-TOCS,
volume = "10",
number = "1",
pages = "53--79",
month = feb,
year = "1992",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Wed Jan 13 18:36:53 MST 1999",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-1/p53-anderson/",
abstract = "{\em Threads\/} are the vehicle for concurrency in
many approaches to parallel programming. Threads can be
supported either by the operating system kernel or by
user-level library code in the application address
space, but neither approach has been fully
satisfactory. This paper addresses this dilemma. First,
we argue that the performance of kernel threads is {\em
inherently\/} worse than that of user-level threads,
rather than this being an artifact of existing
implementations; managing parallelism at the user level
is essential to high-performance parallel computing.
Next, we argue that the problems encountered in
integrating user-level threads with other system
services is a consequence of the lack of kernel support
for user-level threads provided by contemporary
multiprocessor operating systems; kernel threads are
the {\em wrong abstraction\/} on which to support
user-level management of parallelism. Finally, we
describe the design, implementation, and performance of
a new kernel interface and user-level thread package
that together provide the same functionality as kernel
threads without compromising the performance and
flexibility advantages of user-level management of
parallelism.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
keywords = "design; measurement; performance",
subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
Management, Scheduling. {\bf D.4.4} Software, OPERATING
SYSTEMS, Communications Management, Input/output. {\bf
D.4.7} Software, OPERATING SYSTEMS, Organization and
Design. {\bf D.4.8} Software, OPERATING SYSTEMS,
author = "Anonymous",
title = "It's a Multithreaded World, Part 1: Multithreaded
operating systems are becoming the norm. {Here}'s how
your applications can exploit them",
journal = j-BYTE,
volume = "17",
number = "5",
pages = "289--??",
month = may,
year = "1992",
ISSN = "0360-5280 (print), 1082-7838 (electronic)",
ISSN-L = "0360-5280",
bibdate = "Tue Jan 2 10:01:41 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "BYTE Magazine",
author = "Anonymous",
title = "It's a Multithreaded World, Part 2: Multithreaded
operating systems are taking over. {Are} your
applications ready?",
journal = j-BYTE,
volume = "17",
number = "6",
pages = "351--??",
month = jun,
year = "1992",
ISSN = "0360-5280 (print), 1082-7838 (electronic)",
ISSN-L = "0360-5280",
bibdate = "Tue Jan 2 10:01:41 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "BYTE Magazine",
author = "Prakash Arunachalam",
title = "Evaluation of a multithreaded microprocessor with
{MIPS R3000} instruction set",
type = "Thesis ({M.S. in Engineering})",
school = "University of Texas at Austin",
address = "Austin, TX, USA",
pages = "vii + 45",
year = "1992",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Computer architecture; MIPS R3000 series
microprocessors; Parallel processing (Electronic
computers); Reduced instruction set computers; RISC
author = "Barr E. Bauer",
title = "Parallel {C} extensions",
journal = j-DDJ,
volume = "17",
number = "8",
pages = "110, 112--114, 124, 127",
month = aug,
year = "1992",
ISSN = "1044-789X",
bibdate = "Tue Sep 10 10:06:23 MDT 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
acknowledgement = ack-nhfb,
affiliation = "Schering-Plough Res. Inst., Bloomfield, NJ, USA",
classification = "C6110P (Parallel programming); C6140D (High level
languages); C6150C (Compilers, interpreters and other
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "C extensions; C programs; Parallel execution regions;
Parallel execution threads; Parallelized program;
Serial program; Silicon Graphics IRIS Power C
thesaurus = "C language; C listings; Parallel languages; Program
author = "Brian N. Bershad and David D. Redell and John R.
Ellis",
title = "Fast mutual exclusion for uniprocessors",
journal = j-SIGPLAN,
volume = "27",
number = "9",
pages = "223--233",
month = sep,
year = "1992",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:16:26 MST 2003",
bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/143365/p223-bershad/",
abstract = "In this paper we describe restartable atomic
sequences, an {\em optimistic\/} mechanism for
implementing simple atomic operations (such as {\em
Test-And-Set\/}) on a uniprocessor. A thread that is
suspended within a restartable atomic sequence is
resumed by the operating system at the beginning of the
sequence, rather than at the point of suspension. This
guarantees that the thread eventually executes the
sequence {\em atomically\/}. A restartable atomic
sequence has significantly less overhead than other
software-based synchronization mechanisms, such as
kernel emulation or software reservation. Consequently,
it is an attractive alternative for use on
uniprocessors that do not support atomic operations.
Even on processors that do support atomic operations in
hardware, restartable atomic sequences can have lower
overhead. We describe different implementations of
restartable atomic sequences for the Mach 3.0 and Taos
operating systems. These systems' thread management
packages rely on atomic operations to implement
higher-level mutual exclusion facilities. We show that
improving the performance of low-level atomic
operations, and therefore mutual exclusion mechanisms,
improves application performance.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "design; languages; measurement; performance",
subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
Management, Mutual exclusion.",
author = "Robert D. (Robert David) Blumofe",
title = "Managing storage for multithreaded computations",
type = "Thesis ({M.S.})",
school = "Massachusetts Institute of Technology, Laboratory for
Computer Science, Department of Electrical Engineering
and Computer Science",
address = "Cambridge, MA, USA",
pages = "83",
year = "1992",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Also available as Report MIT/LCS/TR 552.",
acknowledgement = ack-nhfb,
author = "Bob Boothe and Abhiram Ranade",
title = "Improved multithreading techniques for hiding
communication latency in multiprocessors",
journal = j-COMP-ARCH-NEWS,
volume = "20",
number = "2",
pages = "214--223",
month = may,
year = "1992",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:43 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "G. Cattaneo and G. Di Giore and M. Ruotolo",
title = "Another {C} Threads Library",
journal = j-SIGPLAN,
volume = "27",
number = "12",
pages = "81--90",
month = dec,
year = "1992",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:16:30 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Indranil Chowdhury",
title = "Performance evaluation and architecture of an
instruction cache for multithreaded {RISC} processor",
type = "Thesis ({M.S. in Engineering})",
school = "University of Texas at Austin",
address = "Austin, TX, USA",
pages = "x + 93",
year = "1992",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Cache memory -- Evaluation -- Simulation methods;
Computer architecture; Microprocessors; Reduced
instruction set computers",
author = "David E. Culler and Michial Gunter and James C. Lee",
title = "Analysis of multithreaded microprocessors under
multiprogramming",
type = "Report",
number = "UCB/CSD 92/687",
institution = "University of California, Berkeley, Computer Science
Division",
address = "Berkeley, CA, USA",
pages = "17",
month = may,
year = "1992",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multithreading has been proposed as a means of
tolerating long memory latencies in multiprocessor
systems. Fundamentally, it allows multiple concurrent
subsystems (cpu, network, and memory) to be utilized
simultaneously. This is advantageous on uniprocessor
systems as well, since the processor is utilized while
the memory system services misses. We examine
multithreading on high-performance uniprocessors as a
means of achieving better cost/performance on multiple
processes. Processor utilization and cache behavior are
studied both analytically and through simulation of
timesharing and multithreading using interleaved
reference traces. Multithreading is advantageous when
one has large on-chip caches (32 kilobytes),
associativity of two, and a memory access cost of
roughly 50 instruction times. At this point, a small
number of threads (2-4) is sufficient, the thread
switch need not be extraordinarily fast, and the memory
system need support only one or two outstanding misses.
The increase in processor real-estate to support
multithreading is modest, given the size of the cache
and floating-point units. A surprising observation is
that miss ratios may be lower with multithreading than
with timesharing under a steady-state load. This occurs
because switch-on-miss multithreading introduces unfair
thread scheduling, giving more CPU cycles to processes
with better cache behavior.",
acknowledgement = ack-nhfb,
annote = "Supported in part by the National Science Foundation.
Supported in part by Motorola Inc. and the TRW
Foundation.",
keywords = "Microprocessors; Multiprogramming (Electronic
computers)",
author = "David E. Culler and Michial Gunter and James C. Lee",
title = "Analysis of multithreaded microprocessors under
multiprogramming",
journal = j-COMP-ARCH-NEWS,
volume = "20",
number = "2",
pages = "438--438",
month = may,
year = "1992",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:43 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Michael Day",
title = "Implementing {NLM-Based} Client\slash Server
journal = j-DDJ,
volume = "17",
number = "10",
pages = "78--84",
month = oct,
year = "1992",
ISSN = "1044-789X",
bibdate = "Tue Sep 03 09:15:34 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
UnCover database",
abstract = "NetWare NLMs take full advantage of the multitasking,
multithreaded architecture of the operating system.
Michael presents a distributed file manager made up of
two modules: ENGINE.NLM, an NLM running on a NetWare
3.x server, and CLIENT.EXE, a DOS-based front end
running on the client.",
acknowledgement = ack-nhfb,
classification = "C6150N (Distributed systems)",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "32-Bit protected-mode programs; Client/server
architectures; Distributed file manager; DOS-based
front end; Multitasking; Multithreaded architecture;
NetWare 3.x operating system; Netware Loadable Modules;
Networked system",
thesaurus = "Distributed processing; File servers",
author = "Michael Day",
title = "Implementing {NLM-Based} Client\slash Server
journal = j-DDJ,
volume = "17",
number = "10",
pages = "78--84",
month = oct,
year = "1992",
ISSN = "1044-789X",
bibdate = "Tue Sep 03 09:15:34 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
UnCover database",
abstract = "NetWare NLMs take full advantage of the multitasking,
multithreaded architecture of the operating system.
Michael presents a distributed file manager made up of
two modules: ENGINE.NLM, an NLM running on a NetWare
3.x server, and CLIENT.EXE, a DOS-based front end
running on the client.",
acknowledgement = ack-nhfb,
classification = "C6150N (Distributed systems)",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "32-Bit protected-mode programs; Client/server
architectures; Distributed file manager; DOS-based
front end; Multitasking; Multithreaded architecture;
NetWare 3.x operating system; Netware Loadable Modules;
Networked system",
thesaurus = "Distributed processing; File servers",
author = "Erik H. D'Hollander",
title = "Partitioning and labeling of loops by unimodular
transformations",
volume = "3",
number = "4",
pages = "465--476",
month = jul,
year = "1992",
DOI = "https://doi.org/10.1109/71.149964",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
MRclass = "68Q10 (68Q22)",
MRnumber = "93f:68030",
bibdate = "Mon Apr 14 07:37:07 1997",
bibsource = "Compendex database;
acknowledgement = ack-nhfb,
affiliation = "Dept of Electr Eng, State Univ of Ghent, Belgium",
classification = "722; 723; C4240P (Parallel programming and algorithm
theory); C6110P (Parallel programming); C6150C
(Compilers, interpreters and other processors)",
corpsource = "Dept. of Electr. Eng., State Univ. of Ghent, Belgium",
fjournal = "IEEE Transactions on Parallel and Distributed
Systems",
journal-URL = "http://www.computer.org/tpds/archives.htm",
journalabr = "IEEE Trans Parallel Distrib Syst",
keywords = "computational complexity; Computer Programming ---
Algorithms; Computer Systems Programming; constant
dependence vectors; dependence matrix; dependent
iterations; do-loops; fold nested loop; independent
subsets; invariant dependence; join; labelling
algorithm; loop labelling; loop partitioning;
Multiprocessing Programs; multithreaded dynamic
scheduling; n-; parallel; parallel algorithms; parallel
DO-ALL loops; partitioning algorithm; Partitioning
Algorithms; primitive; program compilers; Program
Transformations; programming; programming theory;
relation; scheduling; serial loop; transformation;
unimodular; Unimodular Transformations; unimodular
transformations",
treatment = "T Theoretical or Mathematical",
author = "Douglas Dale Donalson",
title = "{DISC}: a dynamic performance evaluation of a
multi-thread architecture",
type = "Thesis ({M.S.})",
school = "Electrical and Computer Engineering Department,
University of California, Santa Barbara",
address = "Santa Barbara, CA, USA",
pages = "ix + 88",
year = "1992",
LCCN = "TK174.C2 S25 DOND 1992",
bibdate = "Sat Apr 20 11:18:53 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "J. R. Eykholt and S. R. Kleiman and S. Barton and R.
Faulkner and D. Stein and M. Smith and A. Shivalingiah
and J. Voll and M. Weeks and D. Williams",
title = "Beyond Multiprocessing: Multithreading the {System V
Release} 4 Kernel",
crossref = "USENIX:1992:PSU",
pages = "11--18",
month = "Summer",
year = "1992",
bibdate = "Fri Oct 18 07:24:24 MDT 1996",
bibsource = "ftp://ftp.uu.net/library/bibliography;
acknowledgement = ack-nhfb,
affiliation = "SunSoft Inc.",
author = "Edward W. Felten and Dylan James McNamee",
title = "Improving the performance of message-passing
applications by multithreading",
type = "Technical report",
number = "92-09-07",
institution = "University of Washington, Dept. of Computer Science
and Engineering",
address = "Seattle, WA, USA",
pages = "6",
year = "1992",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Achieving maximum performance in message-passing
programs requires that calculation and communication be
overlapped. However, the program transformations
required to achieve this overlap are error-prone and
add significant complexity to the application program.
We argue that calculation/communication overlap can be
achieved easily and consistently by executing multiple
threads of control on each processor, and that this
approach is practical on message-passing architectures
without any special hardware support. We present timing
data for a typical message-passing application, to
demonstrate the advantages of our scheme.",
acknowledgement = ack-nhfb,
annote = "Supported in part by the National Science Foundation.
Supported in part by the Washington Technology Center,
Digital Equipment Corporation, Apple Computer Company,
a Mercury Seven Fellowship and an AT\&T Ph.D.
keywords = "Operating systems",
author = "Maya B. Gokhale and William W. Carlson",
title = "An introduction to compilation issues for parallel
machines",
type = "Technical report",
number = "SRC-TR-92-062",
institution = inst-SRC-IDA,
address = inst-SRC-IDA:adr,
pages = "38",
day = "8",
month = sep,
year = "1992",
bibdate = "Fri Aug 30 08:01:51 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The exploitation of today's high-performance computer
systems requires the effective use of parallelism in
many forms and at numerous levels. This survey article
discusses program analysis and restructuring techniques
that target parallel architectures. We first describe
various categories of architectures that are oriented
toward parallel computation models: vector
architectures, shared memory multiprocessors, massively
parallel machines, message-passing architectures,
VLIWs, and multithreaded architectures. We then
describe a variety of optimization techniques that can
be applied to sequential programs to effectively
utilize the vector and parallel processing units. After
an overview of basic dependence analysis, we present
restructuring transformations on DO loops targeted both
to vectorization and to concurrent execution,
interprocedural and pointer analysis, task scheduling,
instruction level parallelization, and
compiler-assisted data placement. We conclude that
although tremendous advances have been made in
dependence theory and in the development of a `toolkit'
of transformations, parallel systems are used most
effectively when the programmer interacts in the
optimization process.",
acknowledgement = ack-nhfb,
keywords = "Compilers (Computer programs); Computer architecture;
Parallel processing (Electronic computers)",
author = "R. Govindarajan and S. S. Nemawarkar",
title = "A Large Context Multithreaded Architecture",
journal = j-LECT-NOTES-COMP-SCI,
volume = "634",
pages = "423--??",
year = "1992",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon May 13 11:46:24 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Matt Haines and Anton Pedro Willem Bohm",
title = "Software multithreading in a conventional distributed
memory multiprocessor",
type = "Technical report",
number = "CS-92-126",
institution = "Colorado State University, Dept. of Computer Science",
address = "Fort Collins, CO, USA",
pages = "25",
day = "25",
month = sep,
year = "1992",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Today's most powerful computers are distributed memory
multiprocessors. Although they possess massive amounts
of available resources, it is often difficult to
exploit these resources efficiently. Compilers that can
cope with the complexities of these systems are being
constructed, but their scope of effect is often limited
due to the complexity of the analysis and the lack of
runtime information. Novel architectures that can
better tolerate latencies are under construction, but
their effectiveness is unproven, and they do little to
ease the burden on current commercial machines.
Therefore we are designing a runtime system, called
VISA, that attempts to avoid and tolerate latencies on
conventional distributed memory multiprocessors, as
well as provide a single addressing space to ease the
burden of programming or code generation. The goal of
our runtime system is to serve as a tool for studying
the effects of latency avoidance and latency tolerance
on programs running on these conventional
architectures. In this paper we describe the design and
implementation of multithreading in the VISA runtime
system for the purpose of latency tolerance. In
particular, we examine machine-independent designs for
thread representation, thread switching, and
split-phased transactions. We quantify the cost of
multithreading for our environment, present a test
program for which multithreading degrades performance,
and present a program for which multithreading enhances
performance.",
acknowledgement = ack-nhfb,
annote = "Supported in part by a grant from Sandia National
Laboratories.",
keywords = "Multiprocessors",
author = "Steve Halladay and Michael Wiebel",
title = "A Practical Use For Multiple Threads",
journal = j-CUJ,
volume = "10",
number = "1",
pages = "73--??",
month = jan,
year = "1992",
ISSN = "0898-9788",
bibdate = "Fri Aug 30 16:52:23 MDT 1996",
bibsource = "http://www.cuj.com/cbklist.htm;
acknowledgement = ack-nhfb,
fjournal = "C Users Journal",
author = "Hiroaki Hirata and Kozo Kimura and Satoshi Nagamine
and Yoshiyuki Mochizuki and Akio Nishimura and
Yoshimori Nakase and Teiji Nishizawa",
title = "An elementary processor architecture with simultaneous
instruction issuing from multiple threads",
journal = j-COMP-ARCH-NEWS,
volume = "20",
number = "2",
pages = "136--145",
month = may,
year = "1992",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:43 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "H. Hirata and Y. Mochizuki and A. Nishimura and Y.
Nakase and T. Nishizawa",
title = "A multithreaded processor architecture with
simultaneous instruction issuing",
journal = j-SUPERCOMPUTER,
volume = "9",
number = "3",
pages = "23--39",
month = may,
year = "1992",
ISSN = "0168-7875",
bibdate = "Wed Mar 18 08:37:01 MST 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
affiliation = "Media Res. Lab., Matsushita Electr. Ind. Co., Osaka,
Japan",
classification = "C5220P (Parallel architecture); C6110P (Parallel
programming); C6150J (Operating systems)",
corpsource = "Media Res. Lab., Matsushita Electr. Ind. Co., Osaka,
Japan",
fjournal = "Supercomputer",
keywords = "functional unit; independent instruction streams;
multiprogramming; multithreaded processor architecture;
parallel processing; scheduling; simultaneous
instruction issuing; vector machines; VLIW machines",
pubcountry = "Netherlands",
treatment = "P Practical",
author = "T. Hironaka and T. Hashimoto and K. Okazaki and K.
Murakami and S. Tomita",
title = "Benchmarking a Vector-Processor Prototype Based on
Multithreaded Streaming\slash {FIFO} Vector ({MSFV})
Architecture",
crossref = "ACM:1992:CPI",
pages = "272--281",
year = "1992",
bibdate = "Mon Aug 26 10:38:41 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Herbert H. J. Hum and Guang R. Gao",
title = "A high-speed memory organization for hybrid
dataflow\slash {von Neumann} computing",
journal = j-FUT-GEN-COMP-SYS,
volume = "8",
number = "4",
pages = "287--301",
month = sep,
year = "1992",
ISSN = "0167-739X (print), 1872-7115 (electronic)",
ISSN-L = "0167-739X",
bibdate = "Fri Jul 15 09:06:02 MDT 2005",
bibsource = "ftp://ftp.ira.uka.de/bibliography/Os/threads.bib;
abstract = "The paper proposes a novel organization of high-speed
memories, known as the register-cache, for a
multi-threaded architecture. Viewed from the execution
unit, its contents are addressable as ordinary CPU
registers using relatively short addresses. From the
main memory perspective, it is content addressable. In
this register-cache organization, a number of registers
are grouped into a block of registers where a register
in a block is accessed using an offset from the address
of the block, an offset value which is embedded in the
compiler generated code. The binding of register block
locations to register-cache line addresses is
adaptively performed at runtime, thus resulting in a
dynamically allocated register file. In this execution
model, a program is compiled into a number of
instruction threads called super-actors. A super-actor
becomes ready for execution only when its input data
are physically residing in the register-cache and space
is reserved in the register-cache to store its
acknowledgement = ack-nhfb,
fjournal = "Future Generation Computer Systems",
journal-URL = "http://www.sciencedirect.com/science/journal/0167739X",
author = "Suresh Jagannathan and Jim Philbin",
title = "A customizable substrate for concurrent languages",
journal = j-SIGPLAN,
volume = "27",
number = "7",
pages = "55--67",
month = jul,
year = "1992",
ISBN = "0-89791-475-9",
ISBN-13 = "978-0-89791-475-8",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
LCCN = "QA76.7.S53 1992",
bibdate = "Sun Dec 14 09:16:22 MST 2003",
bibsource = "Compendex database;
URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/143095/p55-jagannathan/",
abstract = "We describe an approach to implementing a wide-range
of concurrency paradigms in high-level (symbolic)
programming languages. The focus of our discussion is
STING, a dialect of Scheme, that supports lightweight
threads of control and virtual processors as
first-class objects. Given the significant degree to
which the behavior of these objects may be customized,
we can easily express a variety of concurrency
paradigms and linguistic structures within a common
framework without loss of efficiency. Unlike parallel
systems that rely on operating system services for
managing concurrency, STING implements concurrency
management entirely in terms of Scheme objects and
procedures. It, therefore, permits users to optimize
the runtime behavior of their applications without
requiring knowledge of the underlying runtime system.
This paper concentrates on (a) the implications of the
design for building asynchronous concurrency
structures, (b) organizing large-scale concurrent
computations, and (c) implementing robust programming
environments for symbolic computing.",
acknowledgement = ack-nhfb,
affiliation = "NEC Research Inst",
affiliationaddress = "Princeton, NJ, USA",
annote = "Published as part of the Proceedings of PLDI'92.",
classification = "723.1",
conference = "Proceedings of the ACM SIGPLAN '92 Conference on
Programming Language Design and Implementation",
conferenceyear = "1992",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
journalabr = "SIGPLAN Not",
keywords = "algorithms; Computer programming languages;
Concurrency paradigms; Concurrency structures; design;
languages; Parallel processing systems; performance;
Robust programming; Symbolic programming languages",
meetingaddress = "San Francisco, CA, USA",
meetingdate = "Jun 17--19 1992",
meetingdate2 = "06/17--19/92",
sponsor = "ACM",
subject = "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language
Classifications, Concurrent, distributed, and parallel
languages. {\bf D.3.2} Software, PROGRAMMING LANGUAGES,
Language Classifications, SCHEME. {\bf D.1.3} Software,
PROGRAMMING TECHNIQUES, Concurrent Programming,
Parallel programming.",
author = "Philip J. {Koopman, Jr.} and Peter Lee and Daniel P.
Siewiorek",
title = "Cache Behavior of Combinator Graph Reduction",
journal = j-TOPLAS,
volume = "14",
number = "2",
pages = "265--297",
month = apr,
year = "1992",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Sat Jan 06 14:28:31 1996",
bibsource = "Compiler/Compiler.Lins.bib;
Compiler/garbage.collection.bib; Compiler/Heaps.bib;
note = "Also see~\cite{Koopman:1992:CBC}.",
URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/128867.html",
abstract = "The results of cache-simulation experiments with an
abstract machine for reducing combinator graphs are
presented. The abstract machine, called TIGRE, exhibits
reduction rates that, for similar kinds of combinator
graphs on similar kinds of hardware, compare favorably
with previously reported techniques. Furthermore, TIGRE
maps easily and efficiently onto standard computer
architectures, particularly those that allow a
restricted form of self-modifying code. This provides
some indication that the conventional ``stored
program'' organization of computer systems is not
necessarily an inappropriate one for functional
programming language implementations.\par
This is not to say, however, that present day computer
systems are well equipped to reduce combinator graphs.
In particular, the behavior of the cache memory has a
significant effect on performance. In order to study
and quantify this effect, trace-driven cache
simulations of a TIGRE graph reducer running on a
reduced instruction-set computer are conducted. The
results of these simulations are presented with the
following hardware-cache parameters varied: cache size,
block size, associativity, memory update policy, and
write-allocation policy. To begin with, the cache
organization of a commercially available system is used
and then the performance sensitivity with respect to
variations of each parameter are measured. From the
results of the simulation study, a conclusion is made
that combinator-graph reduction using TIGRE runs most
efficiently when using a cache memory with an
allocate-on-write-miss strategy, moderately large block
size (preferably with subblock placement), and
copy-back memory updates.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Programming Languages and
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
keywords = "algorithms; languages; performance; theory;
sjb = "In amongst all the cache stuff is a description of how
subroutine threading can form the basis for a
relatively efficient method of performing combinator
graph reduction.",
subject = "{\bf B.3.2}: Hardware, MEMORY STRUCTURES, Design
Styles, Cache memories. {\bf B.3.3}: Hardware, MEMORY
STRUCTURES, Performance Analysis and Design Aids,
Simulation. {\bf D.1.1}: Software, PROGRAMMING
TECHNIQUES, Applicative (Functional) Programming. {\bf
D.3.2}: Software, PROGRAMMING LANGUAGES, Language
Classifications, Applicative languages. {\bf D.3.4}:
Software, PROGRAMMING LANGUAGES, Processors, Compilers.
{\bf D.3.4}: Software, PROGRAMMING LANGUAGES,
Processors, Interpreters. {\bf G.2.1}: Mathematics of
Computing, DISCRETE MATHEMATICS, Combinatorics.",
author = "W. Kuchlin",
title = "On the Multi-Threaded Computation of Modular
Polynomial Greatest Common Divisors",
journal = j-LECT-NOTES-COMP-SCI,
volume = "591",
pages = "369--??",
year = "1992",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon May 13 11:46:24 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Philip Lenir and R. Govindarajan and S. S.
Nemawarkar",
title = "Exploiting instruction-level parallelism: the
multithreaded approach",
journal = j-SIGMICRO,
volume = "23",
number = "1--2",
pages = "189--192",
month = dec,
year = "1992",
DOI = "https://doi.org/10.1145/144965.145798",
bibdate = "Fri Apr 16 10:27:43 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/10.1145/144965.145798",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMICRO Newsletter",
journal-URL = "https://dl.acm.org/loi/sigmicro",
author = "T. {Le Sergent} and B. Berthomieu",
title = "Incremental Multi-Threaded Garbage Collection on
Virtually Shared Memory Architectures",
journal = j-LECT-NOTES-COMP-SCI,
volume = "637",
pages = "179--??",
year = "1992",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon May 13 11:46:24 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "R. S. Nikhil and G. M. Papadopoulos and Arvind",
title = "{T}: a multithreaded massively parallel architecture",
journal = j-COMP-ARCH-NEWS,
volume = "20",
number = "2",
pages = "156--167",
month = may,
year = "1992",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:43 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Kazuhiro Ogata and Satoshi Kurihara and Mikio Inari
and Norihisa Doi",
title = "The design and implementation of {HoME}",
journal = j-SIGPLAN,
volume = "27",
number = "7",
pages = "44--54",
month = jul,
year = "1992",
ISBN = "0-89791-475-9",
ISBN-13 = "978-0-89791-475-8",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
LCCN = "QA76.7.S53 1992",
bibdate = "Sun Dec 14 09:16:22 MST 2003",
bibsource = "Compendex database;
URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/143095/p44-ogata/",
abstract = "HoME is a version of Smalltalk which can be
efficiently executed on a multiprocessor and can be
executed in parallel by combining a Smalltalk process
with a Mach thread and executing the process on the
thread. HoME is nearly the same as ordinary Smalltalk
except that multiple processes may execute in parallel.
Thus, almost all applications running on ordinary
Smalltalk can be executed on HoME without changes in
their code. HoME was designed and implemented based on
the following fundamental policies: (1) theoretically,
an infinite number of processes can become active; (2)
the moment a process is scheduled, it becomes active;
(3) no process switching occurs; (4) HoME is equivalent
to ordinary Smalltalk except for the previous three
policies. The performance of the current implementation
of HoME running on OMRON LUNA-88K, which had four
processors, was measured by benchmarks which execute in
parallel with multiple processes. In all benchmarks,
the results showed that HoME's performance is much
better than HPS on the same workstation.",
acknowledgement = ack-nhfb,
affiliation = "Keio Univ",
affiliationaddress = "Yokohama, Jpn",
annote = "Published as part of the Proceedings of PLDI'92.",
classification = "723.1",
conference = "Proceedings of the ACM SIGPLAN '92 Conference on
Programming Language Design and Implementation",
conferenceyear = "1992",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
journalabr = "SIGPLAN Not",
keywords = "Computer programming; design; HPS on Mach environment;
languages; measurement; Object oriented programming;
performance; Smalltalk",
meetingaddress = "San Francisco, CA, USA",
meetingdate = "Jun 17--19 1992",
meetingdate2 = "06/17--19/92",
sponsor = "ACM",
subject = "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
Concurrent Programming. {\bf D.3.4} Software,
PROGRAMMING LANGUAGES, Processors. {\bf D.3.2}
Software, PROGRAMMING LANGUAGES, Language
Classifications, Smalltalk. {\bf D.2.8} Software,
SOFTWARE ENGINEERING, Metrics, Performance measures.",
author = "G. M. Papadopoulos and A. P. W. Bohm and A. T. Dahbura
and R. R. Oldehoeft",
title = "Multithreaded computer systems",
crossref = "IEEE:1992:PSM",
pages = "772--775",
year = "1992",
bibdate = "Wed Apr 15 15:37:20 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C5220P (Parallel architecture)",
corpsource = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
keywords = "architectural principles; data matching; multithreaded
computer systems; parallel architectures; parallel
machines; split-phase memory accesses",
sponsororg = "IEEE; ACM",
treatment = "P Practical",
author = "J. Kent Peacock and Sunil Saxena and Dean Thomas and
Fred Yang and Wilfred Yu",
title = "Experiences from Multithreading System {V} Release 4",
crossref = "USENIX:1992:SED",
pages = "77--92",
day = "26--27",
month = mar,
year = "1992",
bibdate = "Fri Oct 18 07:24:24 MDT 1996",
bibsource = "ftp://ftp.uu.net/library/bibliography;
acknowledgement = ack-nhfb,
affiliation = "Intel Multiprocessor Consortium",
author = "J. Kent Peacock",
title = "File System Multithreading in {System V Release} 4
{MP}",
crossref = "USENIX:1992:PSU",
pages = "19--30",
month = "Summer",
year = "1992",
bibdate = "Tue Feb 20 15:42:13 MST 1996",
bibsource = "ftp://ftp.uu.net/library/bibliography;
acknowledgement = ack-nhfb,
affiliation = "Intel Multi-Processor Consortium",
author = "Thuan Q. Pham and Pankaj K. Garg",
title = "On Migrating a Distributed Application to a
Multithreaded Environment",
crossref = "USENIX:1992:PSU",
pages = "45--54",
month = "Summer",
year = "1992",
bibdate = "Fri Oct 18 07:24:24 MDT 1996",
bibsource = "ftp://ftp.uu.net/library/bibliography;
acknowledgement = ack-nhfb,
affiliation = "Hewlett--Packard Laboratories",
author = "Mitsuhisa Sato and Yuetsu Kodama and Shuichi Sakai and
Yoshinori Yamaguchi and Yasuhito Koumura",
title = "Thread-based programming for the {EM-4} hybrid
dataflow machine",
journal = j-COMP-ARCH-NEWS,
volume = "20",
number = "2",
pages = "146--155",
month = may,
year = "1992",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:43 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Karsten Schwan and Hongyi Zhou",
title = "Multiprocessor real-time threads",
journal = j-OPER-SYS-REV,
volume = "26",
number = "1",
pages = "54--65",
month = jan,
year = "1992",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:36 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Gurjot Singh and Moses Joseph and Dave Barnett",
title = "Debugging real-time systems",
journal = j-DDJ,
volume = "17",
number = "9",
pages = "70, 72, 74, 76--77, 116--117",
month = sep,
year = "1992",
ISSN = "1044-789X",
bibdate = "Tue Sep 10 10:06:23 MDT 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
UnCover database",
abstract = "Modular and incremental development and debugging lead
to reliable real-time systems that perform the
functions they're designed to. Our authors use this
approach when building a simulated data-acquisition
acknowledgement = ack-nhfb,
affiliation = "Lynx Real-Time Syst., Los Gatos, CA, USA",
classification = "C6150G (Diagnostic, testing, debugging and
evaluating systems)",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "Correctness; Debugging cycle; Ldb; POSIX; Real-time
systems; User-friendly multithreaded debugger;
Worst-case performance",
thesaurus = "C listings; Program debugging; Real-time systems",
author = "Gurjot Singh and Moses Joseph and Dave Barnett",
title = "Debugging real-time systems",
journal = j-DDJ,
volume = "17",
number = "9",
pages = "70, 72, 74, 76--77, 116--117",
month = sep,
year = "1992",
ISSN = "1044-789X",
bibdate = "Tue Sep 10 10:06:23 MDT 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
UnCover database",
abstract = "Modular and incremental development and debugging lead
to reliable real-time systems that perform the
functions they're designed to. Our authors use this
approach when building a simulated data-acquisition
acknowledgement = ack-nhfb,
affiliation = "Lynx Real-Time Syst., Los Gatos, CA, USA",
classification = "C6150G (Diagnostic, testing, debugging and
evaluating systems)",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "Correctness; Debugging cycle; Ldb; POSIX; Real-time
systems; User-friendly multithreaded debugger;
Worst-case performance",
thesaurus = "C listings; Program debugging; Real-time systems",
author = "John Allen Smith",
title = "The Multi-Threaded {X} Server",
journal = j-X-RESOURCE,
volume = "1",
number = "1",
pages = "73--89",
month = jan,
year = "1992",
ISBN = "0-937175-96-X",
ISBN-13 = "978-0-937175-96-5",
ISSN = "1058-5591",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "The X Resource",
author = "Helene Wen-Hsin Young-Myers",
title = "Database transitive closure: a performance study of
multithreaded algorithms",
type = "Thesis ({Ph.D.})",
school = "College of Business and Management, University of
Maryland at College Park",
address = "College Park, MD, USA",
pages = "ix + 198",
year = "1992",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Anant Agarwal and Jonathan Babb and David Chaiken and
Godfrey D'Souza and Kirk Johnson and David Kranz and
John Kubiatowicz and Beng-Hong Lim and Gino Maa and Ken
Mackenzie and Dan Nussbaum and Mike Parkin and Donald
Yeung",
title = "Sparcle: a Multithreaded {VLSI} Processor for
Parallel Computing",
journal = j-LECT-NOTES-COMP-SCI,
volume = "748",
pages = "359--??",
year = "1993",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon May 13 11:49:00 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Lubomir Bic and Mayez Al-Mouhamed",
title = "The {EM-4} under Implicit Parallelism",
journal = j-J-PAR-DIST-COMP,
volume = "19",
number = "3",
pages = "255--261",
month = nov,
year = "1993",
DOI = "https://doi.org/10.1006/jpdc.1993.1109",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:18:53 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1109/production;
acknowledgement = ack-nhfb,
classification = "C5220P (Parallel architecture); C6110P (Parallel
programming)",
corpsource = "Dept. of Inf. and Comput. Sci., California Univ.,
Irvine, CA, USA",
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "analysis; benchmark programs; data distribution;
data-dependency; Data-Distributed Execution; DDE; EM-4;
implicit parallelism; interprocessor communication;
iteration-level parallelism; loops; multithreading;
parallel architectures; parallel programming;
treatment = "P Practical; T Theoretical or Mathematical",
author = "Robert D. Blumofe and Charles E. Leiserson",
title = "Space-efficient scheduling of multithreaded
computations",
crossref = "ACM:1993:PTF",
pages = "362--371",
year = "1993",
bibdate = "Wed Feb 20 18:34:01 MST 2002",
bibsource = "http://www.acm.org/pubs/contents/proceedings/series/stoc/;
URL = "http://www.acm.org/pubs/articles/proceedings/stoc/167088/p362-blumofe/p362-blumofe.pdf;
acknowledgement = ack-nhfb,
author = "Bob Boothe",
title = "Evaluation of multithreading and caching in large
shared memory parallel computers",
type = "Thesis ({Ph.D.})",
school = "University of California, Berkeley, Computer Science
Division",
address = "Berkeley, CA, USA",
pages = "ix + 169",
month = jul,
year = "1993",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Also available as Report UCB/CSD 93/766.",
acknowledgement = ack-nhfb,
annote = "Supported in part by the Air Force Office of
Scientific Research (AFOSR/JSEP), by the NSF, and by an
NSF Infrastructure Grant.",
keywords = "Multiprocessors",
author = "Yong-Kim Chong",
title = "Effects of memory consistency models on multithreaded
multiprocessor performance",
type = "Thesis ({M.S.})",
school = "University of Southern California",
address = "Los Angeles, CA, USA",
pages = "viii + 89",
year = "1993",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "David E. Culler and Seth Copen Goldstein and Klaus
Erik Schauser and Thorsten {Von Eicken}",
title = "{TAM} -- a Compiler Controlled {Threaded Abstract
journal = j-J-PAR-DIST-COMP,
volume = "18",
number = "3",
pages = "347--370",
month = jul,
year = "1993",
DOI = "https://doi.org/10.1006/jpdc.1993.1070",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:18:52 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1070/production;
acknowledgement = ack-nhfb,
classification = "C5220P (Parallel architecture)",
corpsource = "Div. of Comput. Sci., California Univ., Berkeley, CA,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "dataflow execution models; parallel architectures;
parallel programming; parallel threads; self-scheduled
machine language; Threaded Abstract Machine",
treatment = "P Practical",
author = "Laura K. Dillon",
title = "A visual execution model for {Ada} tasking",
journal = j-TOSEM,
volume = "2",
number = "4",
pages = "311--345",
month = oct,
year = "1993",
ISSN = "1049-331X (print), 1557-7392 (electronic)",
ISSN-L = "1049-331X",
bibdate = "Fri Apr 20 08:21:35 MDT 2001",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org/pubs/articles/journals/tosem/1993-2-4/p311-dillon/p311-dillon.pdf;
abstract = "A visual execution model for Ada tasking can help
programmers attain a deeper understanding of the
tasking semantics. It can illustrate subtleties in
semantic definitions that are not apparent in natural
language design. We describe a contour model of Ada
tasking that depicts asynchronous tasks (threads of
control), relationships between the environments in
which tasks execute, and the manner in which tasks
interact. The use of this high-level execution model
makes it possible to see what happens during execution
of a program. The paper provides an introduction to the
contour model of Ada tasking and demonstrates its
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Software Engineering and
generalterms = "Algorithms; Design; Languages",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J790",
keywords = "contour model; visual execution model",
subject = "Software --- Software Engineering --- Design Tools and
Techniques (D.2.2); Software --- Software Engineering
--- Programming Environments (D.2.6); Software ---
Programming Languages --- Formal Definitions and Theory
(D.3.1): {\bf Semantics}; Software --- Programming
Languages --- Language Classifications (D.3.2): {\bf
Ada}; Software --- Programming Languages --- Language
Constructs and Features (D.3.3): {\bf Concurrent
programming structures}; Software --- Programming
Techniques --- Concurrent Programming (D.1.3); Theory
of Computation --- Logics and Meanings of Programs ---
Semantics of Programming Languages (F.3.2): {\bf
Operational semantics}; Software --- Programming
Languages --- Processors (D.3.4): {\bf Interpreters}",
author = "Damien Doligez and Xavier Leroy",
title = "A concurrent, generational garbage collector for a
multithreaded implementation of {ML}",
crossref = "ACM:1993:CRT",
pages = "113--123",
year = "1993",
bibdate = "Mon May 3 12:45:53 MDT 1999",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/158511/p113-doligez/",
abstract = "This paper presents the design and implementation of a
``quasi real-time'' garbage collector for Concurrent
Caml Light, an implementation of ML with threads. This
two-generation system combines a fast, asynchronous
copying collector on the young generation with a
non-disruptive concurrent marking collector on the old
generation. This design crucially relies on the ML
compile-time distinction between mutable and immutable
acknowledgement = ack-nhfb,
keywords = "algorithms; design; experimentation; languages;
subject = "{\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language
Constructs and Features, Concurrent programming
structures. {\bf D.3.4} Software, PROGRAMMING
LANGUAGES, Processors, Compilers. {\bf D.3.2} Software,
PROGRAMMING LANGUAGES, Language Classifications, LML.",
author = "Derek L. Eager and John Zahorjan",
title = "Chores: Enhanced Run-Time Support for Shared-Memory
Parallel Computing",
journal = j-TOCS,
volume = "11",
number = "1",
pages = "1--32",
month = feb,
year = "1993",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Wed Jan 13 18:36:53 MST 1999",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-1/p1-eager/",
abstract = "Parallel computing is increasingly important in the
solution of large-scale numerical problems. The
difficulty of efficiently hand-coding parallelism, and
the limitations of parallelizing compilers, have
nonetheless restricted its use by scientific
programmers. In this paper we propose a new paradigm,
{\em chores}, for the run-time support of parallel
computing on shared-memory multiprocessors. We consider
specifically uniform memory access shared-memory
environments, although the chore paradigm should also
be appropriate for use within the clusters of a
large-scale nonuniform memory access machine. We argue
that chore systems attain both the high efficiency of
compiler approaches for the common case of data
parallelism, and the flexibility and performance of
user-level thread approaches for functional
parallelism. These benefits are achieved within a
single, simple conceptual model that almost entirely
relieves the programmer and compiler from concerns of
granularity, scheduling, and enforcement of
synchronization constraints. Measurements of a
prototype implementation demonstrate that the chore
model can be supported more efficiently than can
traditional approaches to either data or functional
parallelism alone.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
keywords = "design; measurement; performance",
subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
Management. {\bf D.4.9} Software, OPERATING SYSTEMS,
Systems Programs and Utilities. {\bf D.4.7} Software,
OPERATING SYSTEMS, Organization and Design, Distributed
systems. {\bf C.3} Computer Systems Organization,
C.4} Computer Systems Organization, PERFORMANCE OF
author = "James L. Estep",
title = "Lightweight multithreaded multimedia conference
type = "Thesis ({M.S.})",
school = "West Virginia University",
address = "Morgantown, WV, USA",
pages = "vi + 57",
year = "1993",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Electronic data processing -- Distributed processing;
Multimedia systems",
author = "Xiaoming Fan",
title = "Latency-directed multithreaded computation and its
architectural support",
type = "Thesis ({Ph.D.})",
school = "Universit{\"a}t Hamburg",
address = "Aachen, Germany",
pages = "xi + 174 + 22 + 11",
year = "1993",
ISBN = "3-8265-0021-0",
ISBN-13 = "978-3-8265-0021-3",
ISSN = "0945-0807",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Summary in German.",
series = "Berichte aus der Informatik",
acknowledgement = ack-nhfb,
keywords = "Computer architecture; Parallel processing (Electronic
author = "Guang Gao and Jean-Luc Gaudiot and Lubomir Bic",
title = "Dataflow and Multithreaded Architectures: {Guest
Editors}' Introduction",
journal = j-J-PAR-DIST-COMP,
volume = "18",
number = "3",
pages = "271--??",
month = jul,
year = "1993",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Sat Apr 12 16:10:59 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
xxnote = "Issue missing from UofUtah Marriott Library??",
author = "G. R. Gao",
title = "An Efficient Hybrid Dataflow Architecture Model",
journal = j-J-PAR-DIST-COMP,
volume = "19",
number = "4",
pages = "293--307",
month = dec,
year = "1993",
DOI = "https://doi.org/10.1006/jpdc.1993.1113",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:18:53 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1113/production;
acknowledgement = ack-nhfb,
classification = "C5220P (Parallel architecture); C6110P (Parallel
programming); C6150N (Distributed systems); C6150C
(Compilers, interpreters and other processors)",
corpsource = "Adv. Comput. Archit. and Program Structures Group,
Montreal Univ., Que., Canada",
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "architecture technique; compiling paradigm; concurrent
operation; conventional; data-driven instruction;
data-driven scheduling scheme; dataflow computers;
dataflow software pipelining; efficient hybrid dataflow
architecture model; execution; fast pipelined
instruction; fine-grain parallelism; hybrid; limited
balancing; loop parallelism; multiple instruction;
parallel architectures; parallel programming; pipeline;
processing; program compilers; scheduling; simple
greedy runtime; space efficiency; threads",
treatment = "P Practical",
author = "Guang R. Gao and Jean-Luc Gaudiot and Lubomir Bic",
title = "Special issue on dataflow and multithreaded
publisher = pub-AP,
address = pub-AP:adr,
pages = "271--389",
year = "1993",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "Journal of parallel and distributed computing; v. 18,
no. 3",
acknowledgement = ack-nhfb,
author = "E. W. Giering and F. Mueller and T. P. Baker",
title = "Implementing {Ada 9X} Features using {POSIX} Threads:
Design Issues",
crossref = "ACM:1993:TCS",
pages = "214--228",
year = "1993",
bibdate = "Sat Jul 05 17:12:34 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Stephen Gildea",
title = "Multi-Threaded {Xlib}",
journal = j-X-RESOURCE,
volume = "5",
number = "1",
pages = "159--166",
month = jan,
year = "1993",
ISBN = "1-56592-020-1",
ISBN-13 = "978-1-56592-020-0",
ISSN = "1058-5591",
bibdate = "Tue Mar 23 12:38:27 1993",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "The X Resource",
author = "Carl Hauser and Christian Jacobi and Marvin Theimer
and Brent Welch and Mark Weiser",
title = "Using threads in interactive systems: a case study",
journal = j-OPER-SYS-REV,
volume = "27",
number = "5",
pages = "94--105",
month = dec,
year = "1993",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:54 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Charles Hayden",
title = "A brief introduction to {Concurrent Pascal}",
journal = j-SIGPLAN,
volume = "28",
number = "3",
pages = "353--354",
month = mar,
year = "1993",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:16:34 MST 2003",
bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/154766/p353-hayden/",
abstract = "Concurrent Pascal is designed for writing concurrent
programs such as operating systems and real-time
monitoring systems on shared-memory computers. A
separate language, Sequential Pascal, is used as the
language for applications programs run by operating
systems written in Concurrent Pascal. Both languages
are extensions of Wirth's Pascal, and share a common
threaded code interpreter. The article describes how
Concurrent Pascal differs from Wirth's Pascal.",
acknowledgement = ack-nhfb,
affiliation = "AT and T Bell Labs., Middletown, NJ, USA",
classification = "C6110P (Parallel programming); C6140D (High level
confdate = "20-23 April 1993",
conflocation = "Cambridge, MA, USA",
confname = "HOPL-II. The second ACM SIGPLAN conference on History
of programming languages, April 20--23, 1993,
Cambridge, MA",
confsponsor = "ACM",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "Concurrent Pascal; languages; Operating systems;
Real-time monitoring systems; Sequential Pascal;
Shared-memory computers; Threaded code interpreter",
subject = "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language
Classifications, Concurrent Pascal. {\bf D.3.2}
Classifications, Pascal. {\bf D.3.3} Software,
PROGRAMMING LANGUAGES, Language Constructs and
Features, Procedures, functions, and subroutines.",
thesaurus = "Parallel languages; Pascal",
author = "Yasuo Hidaka and Hanpei Koike and Hidehiko Tanaka",
title = "Multiple threads in cyclic register windows",
journal = j-COMP-ARCH-NEWS,
volume = "21",
number = "2",
pages = "131--142",
month = may,
year = "1993",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:46 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Wilson C. Hsieh and Paul Wang and William E. Weihl",
title = "Computation migration: enhancing locality for
distributed-memory parallel systems",
journal = j-SIGPLAN,
volume = "28",
number = "7",
pages = "239--248",
month = jul,
year = "1993",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:16:39 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Computation migration is a technique that is based on
compile-time program transformation, for accessing
remote data in a distributed-memory parallel system. In
contrast with RPC-style access, where the access is
performed remotely, and with data migration, where the
data is moved so that it is local, computation
migration moves part of the current thread to the
processor where the data resides. The access is
performed at the remote processor, and the migrated
thread portion continues to run on that same processor;
this makes subsequent accesses in the thread portion
local. The authors describe an implementation of
computation migration that consists of two parts: an
implementation that migrates single activation frames,
and a high-level language annotation that allows a
programmer to express when migration is desired. They
performed experiments using two applications; these
experiments demonstrate that computation migration is a
valuable alternative to RPC and data migration.",
acknowledgement = ack-nhfb,
affiliation = "Lab. of Comput. Sci., MIT, Cambridge, MA, USA",
classification = "C6110P (Parallel programming); C6120 (File
organisation); C6150C (Compilers, interpreters and
other processors)",
confdate = "19-22 May 1993",
conflocation = "San Diego, CA, USA",
confsponsor = "ACM",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "Compile-time program transformation; Computation
migration; Current thread; Distributed-memory parallel
system; High-level language annotation; Remote data;
Remote processor; Single activation frames",
thesaurus = "Distributed memory systems; Parallel programming;
Program compilers; Storage management",
author = "Lorenz Huelsbergen and James R. Larus",
title = "A concurrent copying garbage collector for languages
that distinguish (im)mutable data",
journal = j-SIGPLAN,
volume = "28",
number = "7",
pages = "73--82",
month = jul,
year = "1993",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:16:39 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
affiliation = "Dept. of Comput. Sci., Wisconsin-Madison Univ., WI,
classification = "C6110P (Parallel programming); C6120 (File
organisation); C6150C (Compilers, interpreters and
other processors); C6150N (Distributed systems)",
confdate = "19-22 May 1993",
conflocation = "San Diego, CA, USA",
confsponsor = "ACM",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "Concurrent collection; Concurrent compacting garbage
collector; Garbage-collection pauses; Immutable data;
Minimal mutator/collector synchronization; Multiple
mutator threads; Mutable data; Pure functional
languages; Shared-memory parallel computers; Standard
ML compiler",
thesaurus = "Parallel programming; Program compilers; Shared memory
systems; Storage allocation; Storage management",
author = "Nils Klarlund and Michael I. Schwartzbach",
title = "Graph types",
crossref = "ACM:1993:CRT",
pages = "196--205",
year = "1993",
bibdate = "Mon May 3 12:45:53 MDT 1999",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/158511/p196-klarlund/",
abstract = "Recursive data structures are abstractions of simple
records and pointers. They impose a shape invariant,
which is verified at compile-time and exploited to
automatically generate code for building, copying,
comparing, and traversing values without loss of
efficiency. However, such values are always tree
shaped, which is a major obstacle to practical use. We
propose a notion of graph types, which allow common
shapes, such as doubly-linked lists or threaded trees,
to be expressed concisely and efficiently. We define
regular languages of routing expressions to specify
relative addresses of extra pointers in a canonical
spanning tree. An efficient algorithm for computing
such addresses is developed. We employ a second-order
monadic logic to decide well-formedness of graph type
specifications. This logic can also be used for
automated reasoning about pointer structures.",
acknowledgement = ack-nhfb,
keywords = "algorithms; languages; theory",
subject = "{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS
OF PROGRAMS, Studies of Program Constructs, Type
structure. {\bf D.3.3} Software, PROGRAMMING LANGUAGES,
Language Constructs and Features, Data types and
structures. {\bf F.2.2} Theory of Computation, ANALYSIS
Algorithms and Problems, Computations on discrete
structures. {\bf G.2.2} Mathematics of Computing,
DISCRETE MATHEMATICS, Graph Theory, Trees.",
author = "K. W. Koontz",
title = "Port buffers: a {Mach IPC} optimization for handling
large volumes of small messages",
crossref = "USENIX:1993:PUMb",
pages = "89--102",
year = "1993",
bibdate = "Sat Sep 28 18:52:45 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib;
acknowledgement = ack-nhfb,
affiliation = "Appl. Phys. Lab., Johns Hopkins Univ., Laurel, MD,
classification = "C6150N (Distributed systems)",
keywords = "Communications mechanism; Context switches;
Distributed systems; Ethernet; High-speed networks;
Kernel calls; Local transfer rates; Mach IPC
optimization; Mach kernel; Multi-threaded support;
Network utilization; Nonshared memory parallel
architectures; Port buffers; Staleness feature",
thesaurus = "Buffer storage; Electronic messaging; Network
operating systems; Optimisation; Remote procedure
author = "David Lee",
title = "Threads for {Windows} 3",
journal = j-DDJ,
volume = "18",
number = "10",
pages = "84--??",
month = "Fall",
year = "1993",
ISSN = "1044-789X",
bibdate = "Tue Sep 03 09:15:44 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
UnCover database",
note = "Special Issue: Windows Sourcebook.",
abstract = "Unlike NT, Windows 3 doesn't provide direct support
for threads. With the techniques David illustrates
here, you can implement non-preemptive threads in
Windows 3.",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Beng-Hong Lim and Anant Agarwal",
title = "Waiting Algorithms for Synchronization in Large-Scale
journal = j-TOCS,
volume = "11",
number = "3",
pages = "253--294",
month = aug,
year = "1993",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Wed Jan 13 18:36:53 MST 1999",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-3/p253-lim/",
abstract = "Through analysis and experiments, this paper
investigates two-phase waiting algorithms to minimize
the cost of waiting for synchronization in large-scale
multiprocessors. In a two-phase algorithm, a thread
first waits by polling a synchronization variable. If
the cost of polling reaches a limit {\em Lpoll\/} and
further waiting is necessary, the thread is blocked,
incurring an additional fixed cost, {\em B}. The choice
of {\em Lpoll\/} is a critical determinant of the
performance of two-phase algorithms. We focus on
methods for statically determining {\em Lpoll\/}
because the run-time overhead of dynamically
determining {\em Lpoll\/} can be comparable to the cost
of blocking in large-scale multiprocessor systems with
lightweight threads. Our experiments show that {\em
always-block\/} ({\em Lpoll\/} = 0) is a good waiting
algorithm with performance that is usually close to the
best of the algorithms compared. We show that even
better performance can be achieved with a static choice
of {\em Lpoll\/} based on knowledge of likely wait-time
distributions. Motivated by the observation that
different synchronization types exhibit different
wait-time distributions, we prove that a static choice
of {\em Lpoll\/} can yield close to optimal on-line
performance against an adversary that is restricted to
choosing wait times from a fixed family of probability
distributions. This result allows us to make an optimal
static choice of {\em Lpoll\/} based on synchronization
type. For exponentially distributed wait times, we
prove that setting {\em Lpoll\/} = ln(e-1){\em B\/}
results in a waiting cost that is no more than {\em
e/(e-1)\/} times the cost of an optimal off-line
algorithm. For uniformly distributed wait times, we
prove that setting {\em L\/}poll=1/2(square root of 5
-1){\em B\/} results in a waiting cost that is no more
than (square root of 5 + 1)/2 (the golden ratio) times
the cost of an optimal off-line algorithm. Experimental
measurements of several parallel applications on the
Alewife multiprocessor simulator corroborate our
theoretical findings.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
keywords = "algorithms; experimentation; performance; theory",
subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
Management, Synchronization. {\bf D.4.1} Software,
OPERATING SYSTEMS, Process Management, Mutual
exclusion. {\bf C.4} Computer Systems Organization,
PERFORMANCE OF SYSTEMS. {\bf C.1.2} Computer Systems
Organization, PROCESSOR ARCHITECTURES, Multiple Data
Stream Architectures (Multiprocessors), Parallel
processors**. {\bf D.4.8} Software, OPERATING SYSTEMS,
Performance, Measurements. {\bf D.4.8} Software,
OPERATING SYSTEMS, Performance, Stochastic analysis.",
author = "Cathy McCann and Raj Vaswani and John Zahorjan",
title = "A Dynamic Processor Allocation Policy for
Multiprogrammed Shared-Memory Multiprocessors",
journal = j-TOCS,
volume = "11",
number = "2",
pages = "146--178",
month = may,
year = "1993",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Wed Jan 13 18:36:53 MST 1999",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-2/p146-mccann/",
abstract = "We propose and evaluate empirically the performance of
a dynamic processor-scheduling policy for
multiprogrammed shared-memory multiprocessors. The
policy is dynamic in that it reallocates processors
from one parallel job to another based on the currently
realized parallelism of those jobs. The policy is
suitable for implementation in production systems in
that: ---It interacts well with very efficient
user-level thread packages, leaving to them many
low-level thread operations that do not require kernel
intervention. ---It deals with thread blocking due to
user I/O and page faults. ---It ensures fairness in
delivering resources to jobs. ---Its performance,
measured in terms of average job response time, is
superior to that of previously proposed schedulers,
including those implemented in existing systems. It
provides good performance to very short, sequential
(e.g., interactive) requests. We have evaluated our
scheduler and compared it to alternatives using a set
of prototype implementations running on a Sequent
Symmetry multiprocessor. Using a number of parallel
applications with distinct qualitative behaviors, we
have both evaluated the policies according to the major
criterion of overall performance and examined a number
of more general policy issues, including the advantage
of ``space sharing'' over ``time sharing'' the
processors of a multiprocessor, and the importance of
cooperation between the kernel and the application in
reallocating processors between jobs. We have also
compared the policies according to other criteria
important in real implementations, in particular,
fairness and response time to short, sequential
requests. We conclude that a combination of performance
and implementation considerations makes a compelling
case for our dynamic scheduling policy.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
keywords = "design; measurement; performance",
subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
Management, Scheduling. {\bf D.4.1} Software, OPERATING
SYSTEMS, Process Management,
Multiprocessing/multiprogramming/multitasking. {\bf
C.1.2} Computer Systems Organization, PROCESSOR
ARCHITECTURES, Multiple Data Stream Architectures
author = "J. Gregory Morrisett and Andrew P. Tolmach",
title = "Procs and locks: a portable multiprocessing platform
for {Standard ML} of {New Jersey}",
journal = j-SIGPLAN,
volume = "28",
number = "7",
pages = "198--207",
month = jul,
year = "1993",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:16:39 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "A portable platform has been built for running
Standard ML of New Jersey programs on multiprocessors.
It can be used to implement user-level thread packages
for multiprocessors within the ML language with
first-class continuations. The platform supports
experimentation with different thread scheduling
policies and synchronization constructs. It has been
used to construct a Modula-3 style thread package and a
version of Concurrent ML, and has been ported to three
different multiprocessors running variants of Unix. The
authors describe the platform's design, implementation,
and performance.",
acknowledgement = ack-nhfb,
affiliation = "Carnegie Mellon Univ., Pittsburgh, PA, USA",
classification = "C6110P (Parallel programming); C6140D (High level
languages); C6150C (Compilers, interpreters and other
confdate = "19-22 May 1993",
conflocation = "San Diego, CA, USA",
confsponsor = "ACM",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "Concurrent ML; First-class continuations; Functional
language; Modula-3 style thread package; New Jersey
programs; Portable multiprocessing platform; Portable
platform; Standard ML; Synchronization constructs;
Thread scheduling policies; User-level thread
thesaurus = "Multiprocessing systems; Parallel languages; Parallel
programming; Scheduling",
author = "Walid A. Najjar and A. P. Wim Bohm and W. Marcus
title = "A Quantitative Analysis of Dataflow Program Execution
--- Preliminaries to a Hybrid Design",
journal = j-J-PAR-DIST-COMP,
volume = "18",
number = "3",
pages = "314--326",
month = jul,
year = "1993",
DOI = "https://doi.org/10.1006/jpdc.1993.1067",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:18:52 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1067/production;
acknowledgement = ack-nhfb,
classification = "C6110B (Software engineering techniques); C6110P
(Parallel programming)",
corpsource = "Dept. of Comput. Sci., Colorado State Univ., Fort
Collins, CO, USA",
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "benchmarks; dataflow program execution; dynamic
measure; fine grain intrathread locality; instruction
level locality; parallel programming; software
treatment = "T Theoretical or Mathematical",
author = "Venkat Natarajan and Derek Chiou and Boon Seong Ang",
title = "Performance visualization on {Monsoon}",
journal = j-J-PAR-DIST-COMP,
volume = "18",
number = "2",
pages = "169--180",
month = jun,
year = "1993",
DOI = "https://doi.org/10.1006/jpdc.1993.1054",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:18:52 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1054/production;
acknowledgement = ack-nhfb,
classification = "C5440 (Multiprocessor systems and techniques); C5470
(Performance evaluation and testing); C7430 (Computer
corpsource = "Motorola Cambridge Res. Center, MA, USA",
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "algorithm; application program; compiler; computer
evaluation; data analysis; data collection; data
visualisation; MIT; Monsoon; Motorola; multiprocessor
machine; multithreaded; operating system; parallel
machine; parallel machines; performance evaluation;
performance evaluation tool; programming language;
treatment = "P Practical",
author = "Martin Odersky and Dan Rabin and Paul Hudak",
title = "Call by name, assignment, and the lambda calculus",
crossref = "ACM:1993:CRT",
pages = "43--56",
year = "1993",
bibdate = "Mon May 3 12:45:53 MDT 1999",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/158511/p43-odersky/",
abstract = "We define an extension of the call-by-name lambda
calculus with additional constructs and reduction rules
that represent mutable variables and assignments. The
extended calculus has neither a concept of an explicit
store nor a concept of evaluation order; nevertheless,
we show that programs in the calculus can be
implemented using a single-threaded store. We also show
that the new calculus has the Church--Rosser property
and that it is a conservative extension of classical
lambda calculus with respect to operational
equivalence; that is, all algebraic laws of the
functional subset are preserved.",
acknowledgement = ack-nhfb,
keywords = "languages; theory",
subject = "{\bf F.4.1} Theory of Computation, MATHEMATICAL LOGIC
AND FORMAL LANGUAGES, Mathematical Logic, Lambda
calculus and related systems. {\bf F.3.3} Theory of
of Program Constructs, Type structure.",
author = "Dave Plauger",
title = "Making {C++} Safe for Threads",
journal = j-CUJ,
volume = "11",
number = "2",
pages = "58--??",
month = feb,
year = "1993",
ISSN = "0898-9788",
bibdate = "Fri Aug 30 16:52:23 MDT 1996",
bibsource = "http://www.cuj.com/cbklist.htm;
acknowledgement = ack-nhfb,
fjournal = "C Users Journal",
author = "M. T. Raghunath and Abhiram Ranade",
title = "Designing Interconnection Networks for Multi-Level
crossref = "IEEE:1993:PSP",
pages = "772--781",
year = "1993",
bibdate = "Wed Apr 15 12:04:03 MDT 1998",
bibsource = "Compendex database;
acknowledgement = ack-nhfb,
affiliation = "Univ of California",
affiliationaddress = "Berkeley, CA, USA",
classification = "723; C5220P (Parallel architecture); C5440
(Multiprocessing systems)",
corpsource = "Comput. Sci. Div., California Univ., Berkeley, CA,
keywords = "communication bandwidth; complete graphs; Computer
networks; generic set; global communication
performance; high bandwidth channels; high degree
deBruijn graphs; Interconnection network design;
interconnection networks design; Large scale parallel
machines; large scale parallel machines; latencies;
Multilevel packaging; multilevel packaging;
multiprocessor interconnection networks;
multithreading; network organizations; network
topology; packaging; packaging constraints; packaging
hierarchy; packaging restrictions; packaging
technology; Parallel processing systems; Random traffic
model; random traffic model",
sponsororg = "IEEE; ACM SIGARCH",
treatment = "P Practical",
author = "Arjun Rajagopal",
title = "Design of a multithreaded instruction cache for a
hyperscalar processor",
type = "Thesis ({M.S.})",
school = "Department of Electrical Engineering, Texas A\&M
address = "College Station, TX, USA",
pages = "ix + 84",
year = "1993",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Major electrical engineering",
author = "Sunil Saxena and J. Kent Peacock and Fred Yang and
Vijaya Verma and Mohan Krishnan",
title = "Pitfalls in Multithreading {SVR4 STREAMS} and Other
Weightless Processes",
crossref = "USENIX:1993:PWU",
pages = "85--96",
month = "Winter",
year = "1993",
bibdate = "Tue Oct 22 08:14:49 2002",
bibsource = "ftp://ftp.uu.net/library/bibliography;
URL = "http://www.usenix.org/publications/library/proceedings/sd93/",
acknowledgement = ack-nhfb,
affiliation = "Intel Multiprocessor Consortium",
author = "Carl Schmidtmann and Michael Tao and Steven Watt",
title = "Design and Implementation of a Multi-Threaded {Xlib}",
crossref = "USENIX:1993:PWU",
pages = "193--203",
month = "Winter",
year = "1993",
bibdate = "Tue Oct 22 08:16:35 2002",
bibsource = "ftp://ftp.uu.net/library/bibliography;
URL = "http://www.usenix.org/publications/library/proceedings/sd93/",
acknowledgement = ack-nhfb,
affiliation = "Consultant to Digital Equipment Corporation; Sun
Microsystems; Consultant to Xerox Corporation",
author = "Sumathi Srinivasan",
title = "System design and simulation for the {Demus-2}
multithreaded processor",
type = "Thesis ({M. Eng.})",
school = "Department of Electrical and Computer Engineering,
McMaster University",
address = "Hamilton, ON, Canada",
pages = "x + 109",
year = "1993",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Computer architecture; Computers, Pipeline; McMaster
University. -- Dissertations; Parallel processing
(Electronic computers)",
author = "Victor R. Volkman",
title = "Convert {C} Programs into Multithreaded Applications",
journal = j-CUJ,
volume = "11",
type = "User Report",
number = "4",
pages = "87--??",
month = apr,
year = "1993",
ISSN = "0898-9788",
bibdate = "Fri Aug 30 16:52:23 MDT 1996",
bibsource = "http://www.cuj.com/cbklist.htm;
acknowledgement = ack-nhfb,
fjournal = "C Users Journal",
author = "Victor R. Volkman and John English",
title = "Class {{\tt DOSThread}}: a Base Class for
Multithreaded {DOS} Programs",
journal = j-CUJ,
volume = "11",
type = "CUG library disk documentation",
number = "12",
pages = "113--??",
month = dec,
year = "1993",
ISSN = "0898-9788",
bibdate = "Fri Aug 30 16:52:23 MDT 1996",
bibsource = "http://www.cuj.com/cbklist.htm;
acknowledgement = ack-nhfb,
fjournal = "C Users Journal",
author = "Carl A. Waldspurger and William E. Weihl",
title = "Register relocation: flexible contexts for
multithreading",
journal = j-COMP-ARCH-NEWS,
volume = "21",
number = "2",
pages = "120--130",
month = may,
year = "1993",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:46 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Helene Young-Myers and Louiqa Raschid",
title = "An experimental study of three dataflow paradigms in
multithreaded database transitive closure algorithms on
shared memory multiprocessors",
type = "Technical report",
number = "CS-TR-3060; UMIACS-TR-93-33",
institution = inst-U-MARYLAND,
address = inst-U-MARYLAND:adr,
pages = "21",
month = apr,
year = "1993",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "To appear in a special issue of the Journal of
Parallel and Distributed Computing on Dataflow and
Multithreaded Architectures, July, 1993.",
abstract = "This paper describes an experimental study of three
dataflow paradigms, namely, no dataflow, pipelined
dataflow, and network dataflow, in multithreaded
database transitive closure algorithms on shared memory
multiprocessors. This study shows that dataflow
paradigm directly influences performance parameters
such as the amount of interthread communication, how
data are partitioned among the threads, whether access
to each page of data is exclusive or shared, whether
locks are needed for concurrency control, and how
calculation termination is detected. The algorithm
designed with no dataflow outperforms the algorithms
with dataflow. Approximately linear speedup is achieved
by the no dataflow algorithm with sufficient workload
and primary memory. An exclusive access working set
model and a shared access working set model describe
the interactions between two or more threads' working
sets when access to each page of data is exclusive or
shared among the threads, respectively. These models
are experimentally verified.",
acknowledgement = ack-nhfb,
annote = "Supported in part by the National Science
keywords = "Data flow computing; Multiprocessors",
author = "Helene Young-Myers and Louiqa Raschid",
title = "An Experimental Study of Three Dataflow Paradigms in
Multithreaded Database Transitive Closure Algorithms on
Shared Memory Multiprocessors",
journal = j-J-PAR-DIST-COMP,
volume = "18",
number = "3",
pages = "371--389",
month = jul,
year = "1993",
DOI = "https://doi.org/10.1006/jpdc.1993.1071",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:18:52 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1071/production;
acknowledgement = ack-nhfb,
classification = "C5220P (Parallel architecture); C5470 (Performance
evaluation and testing); C6160 (Database management
systems (DBMS))",
corpsource = "Maryland Univ., College Park, MD, USA",
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "architectures; calculation termination; concurrency
control; database management systems; dataflow;
dataflow paradigms; exclusive access; interthread
communication; linear; network; no dataflow; parallel;
performance evaluation; performance parameters;
pipelined dataflow; shared access; shared memory
systems; speedup",
treatment = "P Practical",
author = "R. A. Alfieri",
title = "An Efficient Kernel-Based Implementation of {POSIX}
Threads",
crossref = "Anonymous:1994:USC",
pages = "59--72",
year = "1994",
bibdate = "Sat May 25 07:59:58 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Anonymous",
title = "On the Design of {Chant}: a Talking Threads
Package",
crossref = "IEEE:1994:PSW",
pages = "350--359",
year = "1994",
bibdate = "Mon Aug 26 10:38:41 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Anonymous",
title = "{Multiprocessor desktops are proliferating, even
though there remains a shortage of multithreaded
applications for them}",
volume = "165",
pages = "60--??",
month = dec,
year = "1994",
ISSN = "1061-0839",
bibdate = "Fri Jan 26 17:24:01 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Open Systems Today",
author = "Anonymous",
title = "Special issue: panel sessions of the {1991 Workshop on
Multithreaded Computers, November 22, 1991,
Albuquerque, New Mexico, in conjunction with
Supercomputing '91}",
journal = j-COMP-ARCH-NEWS,
volume = "22",
number = "1",
pages = "2--33",
year = "1994",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Anonymous",
title = "{Wanted: The Multithreaded CIO}",
journal = j-DATAMATION,
volume = "40",
number = "8",
pages = "34--??",
day = "15",
month = apr,
year = "1994",
ISSN = "0011-6963",
bibdate = "Sat Jan 27 07:35:21 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Technician or business manager? If you want to be a
CIO, you better be both. Add to that a host of
communications skills and an ability to travel in
diverse circles, and you're on your way to being the
Multithreaded CIO of the 1990s.",
acknowledgement = ack-nhfb,
fjournal = "Datamation",
author = "T. P. Baker and Frank Mueller and Viresh Rustagi",
title = "Experience with a Prototype of the {POSIX} ``Minimal
Realtime System Profile''",
crossref = "IEEE:1994:ROS",
pages = "12--17",
year = "1994",
bibdate = "Sat May 25 07:59:58 MDT 1996",
bibsource = "Compendex database;
abstract = "This paper describes experience prototyping the
proposed IEEE standard `minimal realtime system
profile', whose primary component is support for
real-time threads. It provides some background,
describes the implementation, and reports preliminary
performance measurements.",
acknowledgement = ack-nhfb,
affiliation = "Florida State Univ",
affiliationaddress = "Tallahassee, FL, USA",
classification = "722.4; 723.1; 723.1.1; 723.2",
conference = "Proceedings of the 11th IEEE Workshop on Real-Time
Operating Systems and Software",
conferenceyear = "1994",
journalabr = "Proc IEEE Workshop Real Time Oper Syst Software",
keywords = "Computer operating systems; Computer software
portability; Data structures; High level languages;
Interfaces (computer); Mesa programming language;
Minimal real time system profile; Program processors;
Real time systems; Thread; Thread management; Thread
priority scheduling",
meetingaddress = "Seattle, WA, USA",
meetingdate = "May 18--19 1994",
meetingdate2 = "05/18--19/94",
publisherinfo = "Computer Society Press",
sponsor = "IEEE Computer Society",
author = "Carlos Baquero and Francisco Moura",
title = "Concurrency Annotations in {C++}",
journal = j-SIGPLAN,
volume = "29",
number = "7",
pages = "61--67",
month = jul,
year = "1994",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:16:53 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C6110J (Object-oriented programming); C6110P
(Parallel programming); C6140D (High level languages)",
corpsource = "DI/INESC, Minho Univ., Portugal",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "access flag; C language; C++; concurrency annotations;
inheritance; inheritance chain; language extension;
method code; method invocations; method predicates;
multiple threads; object-oriented languages; parallel
languages; shared-memory multiprocessor system;
synchronisation; synchronization code; synchronization
treatment = "P Practical",
author = "R. D. Blumofe and C. E. Leiserson",
title = "Scheduling multithreaded computations by work
stealing",
crossref = "Goldwasser:1994:PAS",
pages = "356--368",
year = "1994",
bibdate = "Thu Apr 5 06:13:51 MDT 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "R. Buendgen and M. Goebel and W. Kuechlin",
title = "Multi-Threaded {AC} Term Rewriting",
crossref = "Hong:1994:FIS",
pages = "84--93",
year = "1994",
bibdate = "Thu Mar 12 11:28:58 MST 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/issac.bib;
acknowledgement = ack-nhfb,
author = "R. Buendgen and M. Goebel and W. Kuechlin",
title = "Multi-Threaded {AC} Term Rewriting",
crossref = "Hong:1994:FIS",
pages = "84--93",
year = "1994",
bibdate = "Thu Mar 12 11:28:58 MST 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/issac.bib;
acknowledgement = ack-nhfb,
author = "R. J. A. Buhr and R. S. Casselman",
title = "Timethread-Role Maps for Object-Oriented Design of
Real-Time-and-Distributed Systems",
journal = j-SIGPLAN,
volume = "29",
number = "10",
pages = "301--301",
month = oct,
year = "1994",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Fri Apr 24 18:36:02 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C6110J (Object-oriented programming); C6150N
(Distributed systems)",
conflocation = "Portland, OR, USA; 23-27 Oct. 1994",
conftitle = "Ninth Annual Conference on Object-Oriented Programming
Systems, Languages, and Applications. OOPSLA '94",
corpsource = "Dept. of Syst. and Comput. Eng., Carleton Univ.,
Ottawa, Ont., Canada",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrency; distributed processing; distributed
systems; dynamic structure; end-to-end responsibility
paths; object-oriented approach; object-oriented
design; object-oriented design methods; object-oriented
methods; object-oriented programming; real-time
systems; real-time systems oriented programming;
responsibility-driven design; timethread-role maps",
sponsororg = "ACM",
treatment = "P Practical",
author = "Reinhard B{\"u}ndgen and Manfred G{\"o}bel and
Wolfgang K{\"u}chlin",
title = "A fine-grained parallel completion procedure",
crossref = "ACM:1994:IPI",
pages = "269--277",
year = "1994",
bibdate = "Thu Mar 12 08:41:19 MST 1998",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/issac/190347/p269-bundgen/",
abstract = "We present a parallel Knuth--Bendix completion
algorithm where the inner loop, deriving the
consequences of adding a new rule to the system, is
multithreaded. The selection of the best new rule in
the outer loop, and hence the completion strategy, is
exactly the same as for the sequential algorithm. Our
implementation, which is within the PARSAC-2 parallel
symbolic computation system, exhibits good parallel
speedups on a standard multiprocessor workstation.",
acknowledgement = ack-nhfb,
affiliation = "Wilhelm-Schickard-Inst. fur Inf., Tubingen Univ.,
classification = "C4210L (Formal languages and computational
linguistics); C4240P (Parallel programming and
algorithm theory); C6130 (Data handling techniques);
C6150N (Distributed systems software); C7310
(Mathematics computing)",
keywords = "algorithms; Fine grained parallel completion
procedure; Fine-grained parallel completion procedure;
Multithreaded inner loop; Parallel Knuth--Bendix
completion algorithm; Parallel speedups; PARSAC-2
parallel symbolic computation system; Standard
multiprocessor workstation",
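subject = "{\bf I.1.2} Computing Methodologies, SYMBOLIC AND
ALGEBRAIC MANIPULATION, Algorithms, Algebraic
algorithms. {\bf I.1.0} Computing Methodologies,
SYMBOLIC AND ALGEBRAIC MANIPULATION, General. {\bf
I.1.3} Computing Methodologies, SYMBOLIC AND ALGEBRAIC
MANIPULATION, Languages and Systems. {\bf F.4.2} Theory
of Computation, MATHEMATICAL LOGIC AND FORMAL
LANGUAGES, Grammars and Other Rewriting Systems,
Parallel rewriting systems. {\bf F.1.2} Theory of
Computation, COMPUTATION BY ABSTRACT DEVICES, Modes of
Computation, Parallelism and concurrency.",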
thesaurus = "Parallel algorithms; Parallel machines; Rewriting
systems; Symbol manipulation",
author = "Nicholas P. Carter and Stephen W. Keckler and William
J. Dally",
title = "Hardware support for fast capability-based
addressing",
journal = j-SIGPLAN,
volume = "29",
number = "11",
pages = "319--327",
month = nov,
year = "1994",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:16:57 MST 2003",
bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/195473/p319-carter/",
abstract = "Traditional methods of providing protection in memory
systems do so at the cost of increased context switch
time and/or increased storage to record access
permissions for processes. With the advent of computers
that supported cycle-by-cycle multithreading,
protection schemes that increase the time to perform a
context switch are unacceptable, but protecting
unrelated processes from each other is still necessary
if such machines are to be used in non-trusting
environments. This paper examines {\em guarded
pointers\/}, a hardware technique which uses tagged
64-bit pointer objects to implement capability-based
addressing. Guarded pointers encode a segment
descriptor into the upper bits of every pointer,
eliminating the indirection and related performance
penalties associated with traditional implementations
of capabilities. All processes share a single 54-bit
virtual address space, and access is limited to the
data that can be referenced through the pointers that a
process has been issued. Only one level of address
translation is required to perform a memory reference.
Sharing data between processes is efficient, and
protection states are defined to allow fast protected
subsystem calls and create unforgeable data keys.",
acknowledgement = ack-nhfb,
classification = "C5310 (Storage system design); C6120 (File
organisation); C6150N (Distributed systems software)",
conflocation = "San Jose, CA, USA; 4-7 Oct. 1994",
conftitle = "Sixth International Conference on Architectural
Support for Programming Languages and Operating Systems
corpsource = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "54-bit virtual address space; address translation;
capability based addressing; cycle-by-cycle
multithreading; design; fast capability-based
addressing; fast protected subsystem calls; guarded
pointers; hardware support; hardware technique; memory
architecture; memory bit virtual address space; memory
reference; memory systems; multiprocessing programs;
performance; protection schemes; protection states;
segment descriptor; storage allocation; tagged 64-bit
pointer objects; theory; unforgeable data keys; virtual
sponsororg = "ACM; IEEE Comput. Soc",
subject = "{\bf C.0} Computer Systems Organization, GENERAL,
Instruction set design. {\bf C.4} Computer Systems
treatment = "P Practical",
author = "Ben J. Catanzaro",
title = "Multiprocessor system architectures: a technical
survey of multiprocessor\slash multithreaded systems
using {SPARC}, multilevel bus architectures and
{Solaris} {(SunOS)}",
publisher = pub-PHPTR,
address = pub-PHPTR:adr,
pages = "xxxii + 493",
year = "1994",
ISBN = "0-13-089137-1",
ISBN-13 = "978-0-13-089137-2",
LCCN = "QA76.5.C3864 1994",
bibdate = "Fri Aug 7 08:29:38 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "computer architecture; multiprocessors; sun
author = "Jeffrey S. Chase and Henry M. Levy and Michael J.
Feeley and Edward D. Lazowska",
title = "Sharing and Protection in a Single-Address-Space
Operating System",
journal = j-TOCS,
volume = "12",
number = "4",
pages = "271--307",
month = nov,
year = "1994",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Wed Jan 13 18:36:53 MST 1999",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-4/p271-chase/",
abstract = "This article explores memory sharing and protection
support in Opal, a single-address-space operating
system designed for wide-address (64-bit)
architectures. Opal threads execute within protection
domains in a single shared virtual address space.
Sharing is simplified, because addresses are context
independent. There is no loss of protection, because
addressability and access are independent; the right to
access a segment is determined by the protection domain
in which a thread executes. This model enables
beneficial code- and data-sharing patterns that are
currently prohibitive, due in part to the inherent
restrictions of multiple address spaces, and in part to
Unix programming style. We have designed and
implemented an Opal prototype using the Mach 3.0
microkernel as a base. Our implementation demonstrates
how a single-address-space structure can be supported
alongside of other environments on a modern microkernel
operating system, using modern wide-address
architectures. This article justifies the Opal model
and its goals for sharing and protection, presents the
system and its abstractions, describes the prototype
implementation, and reports experience with integrated
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
keywords = "design; experimentation; measurement; performance",
subject = "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage
Management. {\bf C.1.3} Computer Systems Organization,
PROCESSOR ARCHITECTURES, Other Architecture Styles,
Capability architectures**. {\bf D.3.3} Software,
PROGRAMMING LANGUAGES, Language Constructs and
Features, Modules, packages. {\bf D.4.4} Software,
OPERATING SYSTEMS, Communications Management. {\bf
D.4.6} Software, OPERATING SYSTEMS, Security and
Protection, Access controls. {\bf D.4.6} Software,
OPERATING SYSTEMS, Security and Protection, Information
flow controls. {\bf D.4.7} Software, OPERATING SYSTEMS,
Organization and Design. {\bf D.4.8} Software,
OPERATING SYSTEMS, Performance, Measurements. {\bf E.1}
author = "Ghulam Chaudhry and Xuechang Li",
title = "A case for the multithreaded processor architecture",
journal = j-COMP-ARCH-NEWS,
volume = "22",
number = "4",
pages = "55--59",
month = sep,
year = "1994",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:12 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Jack B. Dennis",
title = "Machines and Models for Parallel Computing",
journal = j-INT-J-PARALLEL-PROG,
volume = "22",
number = "1",
pages = "47--77",
month = feb,
year = "1994",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Sat Apr 26 11:04:14 MDT 1997",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=22&issue=1;
acknowledgement = ack-nhfb,
classification = "C5220P (Parallel architecture); C5440
(Multiprocessor systems and techniques); C6110 (Systems
analysis and programming); C6150N (Distributed
corpsource = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
keywords = "concurrency control; dataflow principles; functional
programming; general semantic model; memory latency;
microprocessors; modular software construction;
multithreading; parallel computation; parallel
computing models; parallel machines; parallel
programming; processor architecture; processor design;
RISC; shared memory systems; shared-memory model;
superpipelined; superscalar; synchronization",
treatment = "P Practical",
author = "Len Dorfman and Marc J. Neuberger",
title = "Effective multithreading in {OS/2}",
publisher = pub-MCGRAW-HILL,
address = pub-MCGRAW-HILL:adr,
pages = "xii + 288",
year = "1994",
ISBN = "0-07-017841-0 (paperback)",
ISBN-13 = "978-0-07-017841-0 (paperback)",
LCCN = "QA76.76.O63D6694 1994",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
price = "US\$34.95",
acknowledgement = ack-nhfb,
annote = "System requirements for computer disk: IBM-compatible
PC; 4MB RAM (8MB recommended); OS/2; C compiler such as
IBM CSet++ or Borland C++ for OS/2; high-density floppy
disk drive; hard disk with 3.1MB free space.",
keywords = "Microcomputers -- Operating systems; Operating systems
(Computers); OS/2 (Computer file)",
author = "Pradeep Dubey and Arvind Krishna and M. J. (Michael
J.) Flynn",
title = "Analytical performance modeling for a spectrum of
multithreaded machines",
type = "Research report",
number = "RC 19549 (85007)",
institution = "IBM T. J. Watson Research Center",
address = "Yorktown Heights, NY, USA",
pages = "27",
day = "3",
month = may,
year = "1994",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The throughput of pipelined processors suffers due to
delays associated with instruction dependencies and
memory latencies. Multithreaded architectures try to
tolerate such delays by sharing the pipeline with
independent instruction threads. This paper proposes a
comprehensive analytical framework to quantitate the
performance potential of a wide spectrum of
multithreaded machines ranging from those that are
capable of switching threads every cycle to those that
switch threads only on long inter-instruction
latencies. For machines in the former category, the
proposed analytic model provides an exact solution for
pipeline utilization which is significantly better than
lower and upper bounds obtainable from simple
approximation techniques. Unlike previously published
analytic models of such systems, the Markov model
developed here accepts a general distribution for the
interlock delays with multiple latencies. For machines
in the latter category, the paper provides an
approximate analytic model which is simpler than
previously published analytic models. The models have
been verified using previously published analytical and
simulation-based results. As compared to the simulation
alternative, the models provide a much quicker estimate
of pipeline utilization as a function of a number of
acknowledgement = ack-nhfb,
keywords = "Computer architecture",
author = "William Lynn Gallagher",
title = "Performance limitations of the {MTS} multithreaded
type = "Thesis ({M.S. in Engineering})",
school = "University of Texas at Austin",
address = "Austin, TX, USA",
pages = "xiv + 101",
year = "1994",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "C. A. Gerlhof and A. Kemper",
title = "A Multi-Threaded Architecture for Prefetching in
Object Bases",
journal = j-LECT-NOTES-COMP-SCI,
volume = "779",
pages = "351--364",
year = "1994",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Wed Sep 15 18:44:20 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1994.bib;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
keywords = "database technology; EDBT; extending database
author = "Ken Gibson",
title = "A {C++} Multitasking Class Library",
journal = j-DDJ,
volume = "19",
number = "5",
pages = "28, 30, 32, 34, 96--98",
month = may,
year = "1994",
ISSN = "1044-789X",
bibdate = "Tue Sep 03 09:15:49 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
UnCover database",
abstract = "Multithreaded applications that concurrently execute more
than one section of code aren't directly supported by
languages such as C++. Ken presents a C++ multitasking
class library for MS-DOS that lets you implement a
program as a set of concurrent threads.",
acknowledgement = ack-nhfb,
classification = "C6110J (Object-oriented programming); C6110P
(Parallel programming)",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "C++ multitasking class library; Concurrent execution;
DOS; Embedded processors; Interthread communications;
Locator program; Microsoft C++ 7.0; Multithreaded
applications; Portability; Processor initialization;
Queue class; Real-time device control; Real-time
executive; ROMable image; Scheduler object; Semaphore
class; Simulation; Thread class; Thread
thesaurus = "C listings; Multiprogramming; Object-oriented
programming; Public domain software; Scheduling;
author = "Wolfgang K. Giloi",
title = "Parallel supercomputer architectures and their
programming models",
volume = "20",
number = "10--11",
pages = "1443--1470",
day = "3",
month = nov,
year = "1994",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Fri Aug 6 10:13:51 MDT 1999",
bibsource = "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1994&volume=20&issue=10-11;
URL = "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1994&volume=20&issue=10-11&aid=907",
acknowledgement = ack-nhfb,
classification = "C5220P (Parallel architecture); C5440
(Multiprocessing systems); C5470 (Performance
evaluation and testing)",
corpsource = "FIRST, GMD Res. Inst. for Comput. Arch. and Software
Eng., Berlin, Germany",
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
keywords = "*T; abstract machine; architectures; DASH; distributed
memory; distributed memory systems; distributed shared;
hardware architecture; latency hiding; latency
minimization; MANNA; memory architectures; message
passing; message passing architectures; multi-threaded
architectures; parallel; parallel supercomputer
architectures; performance; performance evaluation;
physically shared memory systems; programming models;
scalability; shared memory architectures; shared memory
systems; systems; taxonomy; virtual",
treatment = "P Practical",
author = "Matthew Haines and David Cronk and Piyush Mehrotra",
title = "On the design of {Chant}: a talking threads package:
final report",
number = "194903",
publisher = pub-NTIS,
address = pub-NTIS:adr,
pages = "??",
year = "1994",
LCCN = "NAS 1.26:194903 Govt Pubs",
bibdate = "Fri May 10 12:18:17 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Shipping list number 94-0861-M.",
series = "NASA contractor report",
acknowledgement = ack-nhfb,
keywords = "message processing; messages",
author = "Burt Halstead and David Callahan and Jack Dennis and
R. S. Nikhil and Vivek Sarkar",
title = "Programming, compilation, and resource management
issues for multithreading (panel session {II})",
journal = j-COMP-ARCH-NEWS,
volume = "22",
number = "1",
pages = "19--33",
month = mar,
year = "1994",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:34 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "J. Holm and A. Lain and P. Banerjee",
title = "Compilation of Scientific Programs into Multithreaded
and Message Driven Computation",
crossref = "IEEE:1994:PSH",
pages = "518--525",
year = "1994",
bibdate = "Mon Aug 26 10:38:41 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Robert Iannucci and Anant Agarwal and Bill Dally and
Anoop Gupta and Greg Papadopoulos and Burton Smith",
title = "Architectural and implementation issues for
multithreading (panel session {I})",
journal = j-COMP-ARCH-NEWS,
volume = "22",
number = "1",
pages = "3--18",
month = mar,
year = "1994",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:34 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
editor = "Robert A. Iannucci and others",
title = "Multithreaded computer architecture: a summary of the
state of the art",
volume = "SECS 0281",
publisher = pub-KLUWER,
address = pub-KLUWER:adr,
pages = "xvi + 400",
year = "1994",
ISBN = "0-7923-9477-1",
ISBN-13 = "978-0-7923-9477-8",
LCCN = "QA76.9.A73 M85 1994",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "The Kluwer international series in engineering and
computer science",
acknowledgement = ack-nhfb,
keywords = "computer architecture; Computer architecture;
Computers -- Design",
author = "K. Jeffay",
title = "On latency management in time-shared operating
systems",
crossref = "IEEE:1994:PIW",
pages = "86--90",
year = "1994",
bibdate = "Sat Sep 28 18:52:45 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib;
acknowledgement = ack-nhfb,
affiliation = "Dept. of Comput. Sci., North Carolina Univ., Chapel
Hill, NC, USA",
classification = "C6150J (Operating systems); C6150N (Distributed
keywords = "End-to-end latency; Inter-process communication
interconnections; Latency management; Multi-threaded
applications; Real-Time Mach kernel; Time-shared
operating systems; YARTOS kernel",
thesaurus = "Message passing; Operating systems [computers];
Real-time systems; Scheduling; Time-sharing programs",
author = "John M. {Kanalakis, Jr.}",
title = "Examining {OS/2} 2.1 threads",
journal = j-DDJ,
volume = "19",
number = "1",
pages = "74, 76, 78--79, 96",
month = jan,
year = "1994",
ISSN = "1044-789X",
bibdate = "Tue Sep 10 08:52:50 MDT 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
UnCover database",
abstract = "The OS/2 2.1 multitasking model is based on the
execution of threads, making it possible for many
sections of a single process to execute simultaneously.
John examines OS/2's thread architecture, specifically,
the scheduling process.",
acknowledgement = ack-nhfb,
classification = "C6150J (Operating systems)",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "Bias implementation; OS/2 2.1 multitasking model;
Round robin scheduling; Scheduling process; Thread
architecture; Threads",
thesaurus = "Multiprogramming; Operating systems [computers];
author = "Michael Kelly",
title = "Multithreading with {OS/2} and {Borland C++}",
journal = j-CCCUJ,
volume = "12",
number = "8",
pages = "67--??",
month = aug,
year = "1994",
ISSN = "1075-2838",
bibdate = "Fri Aug 30 16:52:23 MDT 1996",
bibsource = "http://www.cuj.com/cbklist.htm;
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Michael Kelly",
title = "Multithreading with {OS/2} and {Borland C++}",
journal = j-CCCUJ,
volume = "12",
number = "8",
pages = "67--??",
month = aug,
year = "1994",
ISSN = "1075-2838",
bibdate = "Fri Aug 30 16:52:23 MDT 1996",
bibsource = "http://www.cuj.com/cbklist.htm;
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Chinhyun Kim",
title = "Functional programming and fine-grain multithreading
for high-performance parallel computing",
type = "Thesis ({Ph.D.})",
school = "University of Southern California",
address = "Los Angeles, CA, USA",
pages = "xv + 150",
year = "1994",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "C. Kim and J.-L. Gaudiot",
title = "A Hierarchical Activation Management Technique for
Fine-Grain Multithreaded Execution",
journal = j-LECT-NOTES-COMP-SCI,
volume = "817",
pages = "577--??",
year = "1994",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon May 13 11:52:14 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Orran Krieger and Michael Stumm and Ron Unrau",
title = "The {Alloc Stream Facility}: a Redesign of
Application-Level Stream {I/O}",
journal = j-COMPUTER,
volume = "27",
number = "3",
pages = "75--82",
month = mar,
year = "1994",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Mon Feb 3 07:28:57 MST 1997",
bibsource = "Compendex database;
abstract = "Many stdio and even Unix I/O applications run faster
when linked to the ASF application-level library. Using
the Alloc Stream Interface improves performance even
acknowledgement = ack-nhfb,
affiliation = "Dept. of Electr. and Comput. Eng., Toronto Univ.,
Ont., Canada",
affiliationaddress = "Toronto, Can",
classification = "723; C6110J (Object-oriented programming); C6110P
(Parallel programming); C6150J (Operating systems)",
fjournal = "Computer",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
journalabr = "Computer",
keywords = "Alloc Stream Facility; Alloc stream interface;
Application-level I/O facility; Application-level
library; Application-level stream I/O; ASF; C stdio
library; C++ stream I/O; Computer operating systems;
Concurrency; I/O-intensive applications; Input output
programs; Mapped files; Multithreaded applications;
Object-oriented structure; Parallel applications;
Parallel systems; Performance improvements; Popular I/O
interfaces; Sequential byte stream; Standard Unix
systems; Stdio; System behavior; UNIX",
thesaurus = "Input-output programs; Object-oriented methods;
Parallel programming; Unix",
author = "James Laudon and Anoop Gupta and Mark Horowitz",
title = "Interleaving: a multithreading technique targeting
multiprocessors and workstations",
journal = j-SIGPLAN,
volume = "29",
number = "11",
pages = "308--318",
month = nov,
year = "1994",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:16:57 MST 2003",
bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
note = "Co-published in {\em Operating Systems Review}, {\bf
URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/195473/p308-laudon/",
abstract = "There is an increasing trend to use commodity
microprocessors as the compute engines in large-scale
multiprocessors. However, given that the majority of
the microprocessors are sold in the workstation market,
not in the multiprocessor market, it is only natural
that architectural features that benefit only
multiprocessors are less likely to be adopted in
commodity microprocessors. In this paper, we explore
multiple-context processors, an architectural technique
proposed to hide the large memory latency in
multiprocessors. We show that while current
multiple-context designs work reasonably well for
multiprocessors, they are ineffective in hiding the
much shorter uniprocessor latencies using the limited
parallelism found in workstation environments. We
propose an alternative design that combines the best
features of two existing approaches, and present
simulation results that show it yields better
performance for both multiprogrammed workloads on a
workstation and parallel applications on a
multiprocessor. By addressing the needs of the
workstation environment, our proposal makes multiple
contexts more attractive for commodity
acknowledgement = ack-nhfb,
classification = "C5430 (Microcomputers); C5440 (Multiprocessing
systems); C6120 (File organisation); C6150J (Operating
conflocation = "San Jose, CA, USA; 4-7 Oct. 1994",
conftitle = "Sixth International Conference on Architectural
Support for Programming Languages and Operating Systems
corpsource = "Comput. Syst. Lab., Stanford Univ., CA, USA",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "architectural features; commodity microprocessors;
compute engines; design; interleaved storage;
interleaving; large memory latency; large-scale
multiprocessors; measurement; multiple-context designs;
multiple-context processors; multiprocessing systems;
multiprogrammed workloads; multiprogramming;
multithreading technique; parallel applications;
parallel uniprocessor latencies; performance; theory;
uniprocessor latencies; workstations",
sponsororg = "ACM; IEEE Comput. Soc",
subject = "{\bf C.5.3} Computer Systems Organization, COMPUTER
SYSTEM IMPLEMENTATION, Microcomputers. {\bf C.4}
Computer Systems Organization, PERFORMANCE OF
treatment = "P Practical",
author = "John Launchbury and Simon L. {Peyton Jones}",
title = "Lazy Functional State Threads",
journal = j-SIGPLAN,
volume = "29",
number = "6",
pages = "24--35",
month = jun,
year = "1994",
ISBN = "0-89791-598-4",
ISBN-13 = "978-0-89791-598-4",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:16:51 MST 2003",
bibsource = "http://www.acm.org/pubs/contents/proceedings/pldi/178243/index.html;
URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/178243/p24-launchbury/",
abstract = "Some algorithms make critical internal use of
updatable state, even though their external
specification is purely functional. Based on earlier
work on monads, we present a way of securely
encapsulating stateful computations that manipulate
multiple, named, mutable objects, in the context of a
non-strict, purely-functional language. The security of
the encapsulation is assured by the type system, using
parametricity. Intriguingly, this parametricity
requires the provision of a (single) constant with a
rank-2 polymorphic type.",
acknowledgement = ack-nhfb,
annote = "Published as part of the Proceedings of PLDI'94.",
classification = "C4240 (Programming and algorithm theory); C6110
(Systems analysis and programming); C6140D (High level
conflocation = "Orlando, FL, USA; 20-24 June 1994",
conftitle = "ACM SIGPLAN '94 Conference on Programming Language
Design and Implementation (PLDI)",
corpsource = "Glasgow Univ., UK",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "algorithms; encapsulation; external specification;
functional language; functional programming; high level
languages; languages; lazy functional state threads;
monads; mutable objects; nonstrict purely-functional
language; parametricity; rank-2 polymorphic type;
security; specification; stateful computations; type
system; type theory; updatable state",
sponsororg = "ACM",
subject = "{\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language
Constructs and Features, Procedures, functions, and
subroutines. {\bf D.3.2} Software, PROGRAMMING
LANGUAGES, Language Classifications, Applicative
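(functional) languages. {\bf F.3.3} Theory of
Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies
of Program Constructs, Type structure. {\bf F.4.1}
Theory of Computation, MATHEMATICAL LOGIC AND FORMAL
LANGUAGES, Mathematical Logic, Lambda calculus and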
related systems.",
treatment = "P Practical; T Theoretical or Mathematical",
author = "Ben Lee and A. R. Hurson",
title = "Dataflow Architectures and Multithreading",
journal = j-COMPUTER,
volume = "27",
number = "8",
pages = "27--39",
month = aug,
year = "1994",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Mon Feb 3 07:28:57 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Contrary to initial expectations, implementing
dataflow computers has presented a monumental
challenge. Now, however, multithreading offers a
viable alternative for building hybrid architectures
that exploit parallelism.",
acknowledgement = ack-nhfb,
affiliation = "Dept. of Electr. and Comput. Eng., Oregon State Univ.,
Corvallis, OR, USA",
classification = "C5220P (Parallel architecture); C5440
(Multiprocessing systems)",
fjournal = "Computer",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
keywords = "Compilers; Concurrency; Data dependencies; Dataflow
architectures; Dataflow machines; Functional semantics;
Hybrid architectures; Id; Imperative languages;
Multithreading; Parallel functional languages; Parallel
machines; Parallelism; Programmability; Semantics; Side
effects; SISAL; Source code; Streams and Iterations in
a Single Assignment Language; Syntax; Threaded Abstract
thesaurus = "Parallel architectures; Parallel processing",
author = "Jochen Liedtke",
title = "A short note on implementing thread exclusiveness and
address space locking",
journal = j-OPER-SYS-REV,
volume = "28",
number = "3",
pages = "38--42",
month = jul,
year = "1994",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:46 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "David Ta-Chang Lu",
title = "A multithreaded processor for massively parallel
type = "Thesis ({M.S.})",
school = "University of California, Riverside",
address = "Riverside, CA, USA",
pages = "vii + 42",
year = "1994",
LCCN = "QA76.58 .L88 1994",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "computer algorithms; Computer algorithms; computer
architecture; Computer architecture; dissertations;
dissertations, academic -- UCR -- computer science;
parallel computers; Parallel computers; Parallel
processing (Electronic computers); parallel processing
(electronic computers); Science -- Dissertations;
University of California, Riverside. -- Dept. of
Computer; University of California, Riverside. Dept. of
Computer Science",
author = "Dan C. Marinescu and John R. Rice",
title = "On High Level Characterization of Parallelism",
journal = j-J-PAR-DIST-COMP,
volume = "20",
number = "1",
pages = "107--113",
month = jan,
year = "1994",
DOI = "https://doi.org/10.1006/jpdc.1994.1011",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:18:53 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1994.1011/production;
acknowledgement = ack-nhfb,
classification = "C4240P (Parallel programming and algorithm theory);
C5220P (Parallel architecture); C5470 (Performance
evaluation and testing)",
corpsource = "Dept. of Comput. Sci., Purdue Univ., West Lafayette,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "communication complexity; load balancing; massively
parallel; parallel architectures; parallel execution;
parallelism; performance analysis; performance
evaluation; speedup; systems; threads of control",
treatment = "T Theoretical or Mathematical",
author = "{Mix Software, Inc}",
title = "Using {Multi-C}: a portable multithreaded {C}
programming library",
publisher = pub-PHPTR,
address = pub-PHPTR:adr,
pages = "vi + 257",
year = "1994",
ISBN = "0-13-606195-8",
ISBN-13 = "978-0-13-606195-3",
LCCN = "QA76.73.C15 U85 1994",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "System requirements for computer disk: IBM-compatible
PC; DOS; Mix, Borland, or Microsoft-compatible C/C++
acknowledgement = ack-nhfb,
annote = "System requirements for computer disk: IBM-compatible
PC; DOS; Mix, Borland, or Microsoft-compatible C/C++
keywords = "C (computer program language); C (Computer program
language); Microcomputers -- Programming languages",
author = "Bodhisattwa Mukherjee and Greg Eisenhauer and Kaushik
title = "A machine independent interface for lightweight
journal = j-OPER-SYS-REV,
volume = "28",
number = "1",
pages = "33--47",
month = jan,
year = "1994",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:36 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "S. S. Nemawarkar and R. Govindarajan and G. R. Gao and
V. K. Agarwal",
title = "Performance of Interconnection Network in
Multithreaded Architectures",
journal = j-LECT-NOTES-COMP-SCI,
volume = "817",
pages = "823--??",
year = "1994",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon May 13 11:52:14 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Rishiyur S. Nikhil",
title = "A Multithreaded Implementation of {Id} using {P-RISC}
journal = j-LECT-NOTES-COMP-SCI,
volume = "768",
pages = "390--??",
year = "1994",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon May 13 11:52:14 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "John Norwood and Shankar Vaidyanathan",
title = "Symmetric Multiprocessing for {PCs}",
journal = j-DDJ,
volume = "19",
number = "1",
pages = "80, 82--85, 98--99",
month = jan,
year = "1994",
ISSN = "1044-789X",
bibdate = "Tue Sep 03 09:15:46 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
UnCover database",
abstract = "Our authors focus on multithreaded application
development for single-processor and
symmetric-multiprocessor machines under Windows NT. In
doing so, they present Fortran interface statements for
the Win32 console API and a black-box solution for
calling 32-bit DLLs from 16-bit applications under
acknowledgement = ack-nhfb,
classification = "C6150J (Operating systems); C6150N (Distributed
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "16-Bit applications; 32-Bit DLLs; Black-box solution;
Fortran interface statements; Multithreaded
application; Single processor machines;
Symmetric-multiprocessor machines; Win32 console API;
Windows NT",
thesaurus = "C listings; Multiprocessing programs;
author = "Norman Ramsey",
title = "Correctness of trap-based breakpoint implementations",
crossref = "ACM:1994:CRP",
pages = "15--24",
year = "1994",
bibdate = "Mon May 3 12:50:22 MDT 1999",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/174675/p15-ramsey/",
abstract = "It is common for debuggers to implement breakpoints by
a combination of planting traps and single stepping.
When the target program contains multiple threads of
execution, a debugger that is not carefully implemented
may miss breakpoints. This paper gives a formal model
of a breakpoint in a two-threaded program. The model
describes correct and incorrect breakpoint
implementations. Automatic search of the model's state
space shows that the correct implementation does not miss a
breakpoint. The results apply even to debuggers like
dbx and gdb, which are apparently for single-threaded
programs; when the user evaluates an expression
containing function calls, the debugger executes the
call in the target address space, in effect creating a
new thread.",
acknowledgement = ack-nhfb,
keywords = "languages; measurement; theory",
subject = "{\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing
and Debugging. {\bf F.3.1} Theory of Computation,
LOGICS AND MEANINGS OF PROGRAMS, Specifying and
Verifying and Reasoning about Programs.",
author = "John Rodley",
title = "{OS/2} and {UnixWare} Interprocess Communication",
journal = j-DDJ,
volume = "19",
number = "5",
pages = "78--82, 84, 107--109",
month = may,
year = "1994",
ISSN = "1044-789X",
bibdate = "Tue Sep 03 09:15:49 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
UnCover database",
abstract = "Interprocess communication isn't portable between
IBM's OS/2 2.1 and Novell's UnixWare 1.1. But even
though the implementation details differ greatly, the
two systems do share ways of thinking about IPC. John
looks at IPC under OS/2 and UnixWare to see what common
ground exists.",
acknowledgement = ack-nhfb,
classification = "C6150J (Operating systems); C6150N (Distributed
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "APIs; Applications programming; Functionality; IBM
OS/2 2.1; Implementation details; Independent
processes; Interprocess communication; IPC models;
Multitasking operating systems; Novell UnixWare 1.1;
thesaurus = "C listings; Multiprocessing systems; Operating systems
[computers]; Unix",
author = "Jang Chung Shee and Chao Chin Wu and Lin Wen You and
Cheng Chen",
title = "Design of a multithread architecture and its parallel
simulation and evaluation environment",
crossref = "Anonymous:1994:ICS",
pages = "69--76 (vol. 1)",
year = "1994",
bibdate = "Sun Dec 22 10:19:23 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
affiliation = "Inst. of Comput. Sci. and Inf. Eng., Nat. Chiao Tung
Univ., Hsinchu, Taiwan",
classification = "C5220P (Parallel architecture); C6115 (Programming
support); C6185 (Simulation techniques)",
keywords = "Context switch; Integrated multiprocessing simulation
environment; Multithread architecture; Parallel
simulation; Parallel simulation and evaluation
environment; Parallel Virtual Machine; SUN SPARC
workstations; Thread-related instructions",
thesaurus = "Digital simulation; Parallel architectures;
Programming environments",
author = "Simon E. Spero",
title = "{MDMA} --- Multithreaded Daemon for Multimedia
crossref = "Anonymous:1994:PIW",
pages = "??--??",
year = "1994",
bibdate = "Mon Oct 23 09:15:37 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Mark S. Squillante",
title = "Analytic modeling of processor utilization in
multithreaded processor architectures",
type = "Research report",
number = "RC 19543 (84999)",
institution = "IBM T. J. Watson Research Center",
address = "Yorktown Heights, NY, USA",
pages = "9",
month = apr,
year = "1994",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In this paper, we develop an analytic model of
processor utilization in multithreaded processor
architectures that supports both serial and parallel
processing of memory requests. The system is modeled as
a finite, continuous-time Markov chain whose solution
can be obtained efficiently. Although it applies more
generally, our modeling approach supports an important
class of probability distributions that can be used to
approximate the distributions of interest with
sufficient accuracy in most practical cases. This
results in an efficient and accurate model across a
wide variety of system environments.",
acknowledgement = ack-nhfb,
keywords = "Multiprocessors",
author = "Avram K. Tetewsky",
title = "{GUI} Development for Real-Time Applications",
journal = j-DDJ,
volume = "19",
number = "6",
pages = "28, 30, 32, 36, 38, 40--41",
month = jun,
year = "1994",
ISSN = "1044-789X",
bibdate = "Tue Sep 03 09:15:49 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
UnCover database",
abstract = "Although they take radically different approaches,
both ControlCalc and LabView are designed for building
GUI-based, real-time control applications.",
acknowledgement = ack-nhfb,
affiliation = "Draper (C.S.) Lab., Cambridge, MA, USA",
classification = "C6115 (Programming support); C6130B (Graphics
techniques); C6180G (Graphical user interfaces); C7420
(Control engineering)",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "386/OS-9000; 680X0/OS9; ControlCalc Version 1.78;
G-Windows 2.3 windowing package; GUI development;
LabView 3.0; Multipage-spreadsheet paradigm;
Multithreaded program; National Instruments; OS-9000
1.3; PC-based tools; Rapid prototyping; Real-time
control application; RTWare; Windows data-flow driven
thesaurus = "Computerised control; Graphical user interfaces;
Real-time systems; Software tools",
author = "Radhika Thekkath and Susan J. Eggers",
title = "The effectiveness of multiple hardware contexts",
journal = j-SIGPLAN,
volume = "29",
number = "11",
pages = "328--337",
month = nov,
year = "1994",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:16:57 MST 2003",
bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/195473/p328-thekkath/",
abstract = "Multithreaded processors are used to tolerate long
memory latencies. By executing threads loaded in
multiple hardware contexts, an otherwise idle processor
can keep busy, thus increasing its utilization.
However, the larger size of a multi-thread working set
can have a negative effect on cache conflict misses. In
this paper we evaluate the two phenomena together,
examining their combined effect on execution time. The
usefulness of multiple hardware contexts depends on:
program data locality, cache organization and degree of
multiprocessing. Multiple hardware contexts are most
effective on programs that have been optimized for data
locality. For these programs, execution time dropped
with increasing contexts, over widely varying
architectures. With unoptimized applications, multiple
contexts had limited value. The best performance was
seen with only two contexts, and only on uniprocessors
and small multiprocessors. The behavior of the
unoptimized applications changed more noticeably with
variations in cache associativity and cache hierarchy,
unlike the optimized programs. As a mechanism for
exploiting program parallelism, an additional processor
is clearly better than another context. However, there
were many configurations for which the addition of a
few hardware contexts brought as much or greater
performance than a larger multiprocessor with fewer
than the optimal number of contexts.",
acknowledgement = ack-nhfb,
classification = "C5320G (Semiconductor storage); C5440
(Multiprocessing systems); C6110P (Parallel
programming); C6120 (File organisation); C6150N
(Distributed systems software)",
conflocation = "San Jose, CA, USA; 4-7 Oct. 1994",
conftitle = "Sixth International Conference on Architectural
Support for Programming Languages and Operating Systems
corpsource = "Dept. of Comput. Sci. and Eng., Washington Univ.,
Seattle, WA, USA",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "cache associativity; cache conflict misses; cache
hierarchy; cache organization; cache storage; data
locality; design; long; long memory latencies;
measurement; multi-thread working set; multiple
hardware contexts; multiprocessing; multiprocessing
systems; multithreaded processors; parallel
programming; performance; program data locality;
program parallelism; storage management; theory;
unoptimized applications",
sponsororg = "ACM; IEEE Comput. Soc",
subject = "{\bf C.5.3} Computer Systems Organization, COMPUTER
SYSTEM IMPLEMENTATION, Microcomputers. {\bf C.4}
Computer Systems Organization, PERFORMANCE OF
treatment = "P Practical",
author = "R. Thekkath and S. J. Eggers",
title = "Impact of sharing-based thread placement on
multithreaded architectures",
journal = j-COMP-ARCH-NEWS,
volume = "22",
number = "2",
pages = "176--186",
month = apr,
year = "1994",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:40 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Xiaobao Wang",
title = "Multithreaded architecture: design and performance
volume = "3016",
type = "Thesis ({M. S.})",
school = "Department of Electrical Engineering, University of
Hawaii at Manoa",
address = "Manoa, HI, USA",
pages = "59",
year = "1994",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "Theses for the degree of Master of Science (University
of Hawaii at Manoa)",
acknowledgement = ack-nhfb,
keywords = "Computer architecture; Multiprocessors",
author = "Al Williams",
title = "{NT-Style} Threads For {MS-DOS}",
journal = j-DDJ,
volume = "19",
number = "2",
pages = "74, 76--77",
month = feb,
year = "1994",
ISSN = "1044-789X",
bibdate = "Tue Sep 03 09:15:47 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
UnCover database",
abstract = "Al uses Phar Lap's TNT 386/DOS-Extender to implement
NT-style threads in a DOS program that removes a
directory tree. Instead of recursing down the tree, the
program (which works with NT and TNT) processes
directories in parallel.",
acknowledgement = ack-nhfb,
classification = "C6110 (Systems analysis and programming); C6150C
(Compilers, interpreters and other processors); C6150J
(Operating systems)",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "BIOS interrupts; C library functions; Compiling; DOS;
Memory allocation; MS-DOS; Multiple threads;
Multithreading; Phar Lap; Specification; TNT
386/DOS-Extender; Win32 programming API; Win32-base
API; Windows; Windows NT",
thesaurus = "Interrupts; Multiprogramming; Operating systems
[computers]; Program compilers",
author = "Al Williams",
title = "{NT-Style} Threads For {MS-DOS}",
journal = j-DDJ,
volume = "19",
number = "2",
pages = "74, 76--77",
month = feb,
year = "1994",
ISSN = "1044-789X",
bibdate = "Tue Sep 03 09:15:47 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
UnCover database",
abstract = "Al uses Phar Lap's TNT 386/DOS-Extender to implement
NT-style threads in a DOS program that removes a
directory tree. Instead of recursing down the tree, the
program (which works with NT and TNT) processes
directories in parallel.",
acknowledgement = ack-nhfb,
classification = "C6110 (Systems analysis and programming); C6150C
(Compilers, interpreters and other processors); C6150J
(Operating systems)",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "BIOS interrupts; C library functions; Compiling; DOS;
Memory allocation; MS-DOS; Multiple threads;
Multithreading; Phar Lap; Specification; TNT
386/DOS-Extender; Win32 programming API; Win32-base
API; Windows; Windows NT",
thesaurus = "Interrupts; Multiprogramming; Operating systems
[computers]; Program compilers",
author = "W. F. Wong and E. Goto",
title = "A Simulation Study on the Interactions Between
Multithreaded Architectures and the Cache",
volume = "6",
number = "2",
pages = "343--??",
year = "1994",
ISSN = "0129-0533",
bibdate = "Mon Feb 25 11:19:24 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
OCLC Article1st database",
acknowledgement = ack-nhfb,
fjournal = "International Journal of High Speed Computing
author = "Anonymous",
title = "{HP-UX 10.0 will be unveiled this week, with newly
tuned kernel and I\slash {O} paths, plus a
multithreaded NFS implementation}",
volume = "168",
pages = "34--??",
month = feb,
year = "1995",
ISSN = "1061-0839",
bibdate = "Fri Jan 26 17:24:01 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Open Systems Today",
author = "Anonymous",
title = "{HP-UX 10.0 will be unveiled this week, with newly
tuned kernel and I\slash {O} paths, plus a
multithreaded NFS implementation}",
volume = "168",
pages = "34--??",
month = feb,
year = "1995",
ISSN = "1061-0839",
bibdate = "Fri Jan 26 17:24:01 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Open Systems Today",
author = "Mary Baker",
title = "Going threadbare (panel session): sense or sedition? a
debate on the threads abstraction",
journal = j-OPER-SYS-REV,
volume = "29",
number = "5",
pages = "227--227",
month = dec,
year = "1995",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:55 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Henry G. Baker",
title = "``Use-once'' variables and linear objects: storage
management, reflection and multi-threading",
journal = j-SIGPLAN,
volume = "30",
number = "1",
pages = "45--52",
month = jan,
year = "1995",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:16:59 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Prithviraj Banerjee and John A. Chandy and Manish
Gupta and Eugene W. {Hodges IV} and John G. Holm and
Antonio Lain and Daniel J. Palermo and Shankar
Ramaswamy and Ernesto Su",
title = "The {Paradigm} compiler for distributed-memory
journal = j-COMPUTER,
volume = "28",
number = "10",
pages = "37--47",
month = oct,
year = "1995",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Mon Feb 3 07:21:26 MST 1997",
bibsource = "Compendex database;
acknowledgement = ack-nhfb,
affiliation = "Illinois Univ., Urbana, IL, USA",
affiliationaddress = "Urbana-Champaign, IL, USA",
classification = "722.3; 722.4; 723.1; 723.2; C6110P (Parallel
programming); C6150C (Compilers, interpreters and other
processors); C6150N (Distributed systems software)",
fjournal = "Computer",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
journalabr = "Computer",
keywords = "Address space; Automatic parallelization; Codes
(symbols); Computational methods; Computer hardware;
Computer programming; Data communication systems; Data
parallelism; Data partitioning; Data processing;
Distributed memory multicomputer; Distributed-memory
multicomputers; Efficient software; Explicitly managed
communication; Functional parallelism; Irregular
computations; Manually distribution; Massively parallel
computers; Multithreading; Paradigm compiler; Parallel
algorithms; Parallel processing systems; Parallel
programming; Program compilers; Regular computations;
Sequential programs; Supercomputers",
thesaurus = "Distributed memory systems; Parallel machines;
Parallel programming; Parallelising compilers; Program
author = "Lubomir Bic and Guang R. Gao and Jean-Luc Gaudiot",
title = "Advanced topics in dataflow computing and
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "x + 450",
year = "1995",
ISBN = "0-8186-6541-6, 0-8186-6540-8 (paperback)",
ISBN-13 = "978-0-8186-6541-7, 978-0-8186-6540-0 (paperback)",
LCCN = "QA76.9.A73A356 1994",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Computer architecture; Data structures (Computer
science); Parallel processing (Electronic computers)",
author = "Robert D. Blumofe and Christopher F. Joerg and Bradley
C. Kuszmaul and Charles E. Leiserson and Keith H.
Randall and Yuli Zhou",
title = "{Cilk}: an efficient multithreaded runtime system",
journal = j-SIGPLAN,
volume = "30",
number = "8",
pages = "207--216",
month = aug,
year = "1995",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:17:08 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Cilk (pronounced `silk') is a C-based runtime system
for multithreaded parallel programming. In this paper,
we document the efficiency of the Cilk work-stealing
scheduler, both empirically and analytically. We show
that on real and synthetic applications, the `work' and
`critical path' of a Cilk computation can be used to
accurately model performance. Consequently, a Cilk
programmer can focus on reducing the work and critical
path of his computation, insulated from load balancing
and other runtime scheduling issues. We also prove that
for the class of `fully strict' (well-structured)
programs, the Cilk scheduler achieves space, time, and
communication bounds all within a constant factor of
optimal. The Cilk runtime system currently runs on the
Connection Machine CM5 massively parallel processor
(MPP), the Intel Paragon MPP, the Silicon Graphics
Power Challenge symmetric multiprocessor (SMP), and the
MIT Phish network of workstations. Applications written
in Cilk include protein folding, graphic rendering,
backtrack searching, and the *Socrates chess program,
which won third prize in the 1994 ACM International
Computer Chess Championship.",
acknowledgement = ack-nhfb,
affiliation = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
classification = "C6110P (Parallel programming); C6150C (Compilers,
interpreters and other processors); C6150N (Distributed
systems software)",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "*Socrates chess program; Accurate performance
modelling; Backtrack searching; C-based multithreaded
runtime system; Cilk; Communication bounds; Connection
Machine CM5; Critical path; Efficiency; Fully strict
programs; Graphic rendering; Intel Paragon; Load
balancing; MIT Phish workstation network; Parallel
programming; Protein folding; Runtime scheduling
issues; Silicon Graphics Power Challenge; Space bounds;
Time bounds; Well-structured programs; Work-stealing
thesaurus = "Backtracking; Biology computing; Molecular
configurations; Parallel programming; Processor
scheduling; Program interpreters; Proteins; Rendering
[computer graphics]",
author = "Robert D. (Robert David) Blumofe",
title = "Executing multithreaded programs efficiently",
type = "Thesis ({Ph.D.})",
school = "Massachusetts Institute of Technology, Department of
Electrical Engineering and Computer Science",
address = "Cambridge, MA, USA",
pages = "145",
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "T. Bubeck and M. Hiller and W. Kuchlin and W.
Rosenstiel",
title = "Distributed symbolic computation with {DTS}",
crossref = "Ferreira:1995:PAI",
pages = "231--248",
year = "1995",
bibdate = "Sun Dec 22 10:19:23 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography1990.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
affiliation = "Wilhelm-Schickard-Inst. fur Inf., Tubingen Univ.,
Germany",
classification = "C4130 (Interpolation and function approximation);
C4240P (Parallel programming and algorithm theory);
C6110P (Parallel programming); C6115 (Programming
support); C6130S (Data security); C6150N (Distributed
systems software)",
keywords = "Anonymous compute servers; Asynchronous RPC
abstraction; C threads interface; Cryptosystem;
Distributed symbolic computation; Distributed threads
system; DTS; Fork/join parallel programming; Highly
data-dependent algorithm parallelisation; Irregular
algorithm parallelisation; Multiprocessor workstation;
Multithreading; Parallel long integer multiplication;
Parallel multi-variate polynomial resultant
computation; Performance results; Programming
environment; PVM; Shared memory threads",
thesaurus = "Arithmetic; Cryptography; Distributed memory systems;
Multiprocessing programs; Multiprocessing systems;
Parallel algorithms; Parallel programming; Polynomials;
Programming environments; Remote procedure calls;
Shared memory systems; Software performance evaluation;
Symbol manipulation; Workstations",
author = "G. T. Byrd and M. A. Holliday",
title = "Multithreaded processor architectures",
journal = j-IEEE-SPECTRUM,
volume = "32",
number = "8",
pages = "38--46",
month = aug,
year = "1995",
DOI = "https://doi.org/10.1109/6.402166",
ISSN = "0018-9235 (print), 1939-9340 (electronic)",
ISSN-L = "0018-9235",
bibdate = "Thu Jan 16 07:37:23 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeespectrum1990.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "IEEE Spectrum",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=6",
keywords = "Application software; Computer architecture; computer
architecture; Delay; Hardware; High performance
computing; idle cycles; instruction streams; Job shop
scheduling; Large-scale systems; latency;
microprocessor chips; multiple concurrent execution
streams; multiprogramming; multithreaded processor
architectures; performance; Registers; single
processor; Supercomputers; time-consuming operation",
author = "F. Caudal and B. Lecussan",
title = "Design and Evaluation of a Multi-Threaded Architecture
for Parallel Graph Reduction",
journal = j-LECT-NOTES-COMP-SCI,
volume = "964",
pages = "411--??",
year = "1995",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat May 11 13:45:32 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Henry Cejtin and Suresh Jagannathan and Richard
Kelsey",
title = "Higher-Order Distributed Objects",
journal = j-TOPLAS,
volume = "17",
number = "5",
pages = "704--739",
month = sep,
year = "1995",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Fri Jan 5 07:58:42 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/213986.html",
abstract = "We describe a distributed implementation of Scheme
that permits efficient transmission of higher-order
objects such as closures and continuations. The
integration of distributed communication facilities
within a higher-order programming language engenders a
number of new abstractions and paradigms for
distributed computing. Among these are user-specified
load-balancing and migration policies for threads,
incrementally linked distributed computations, and
parameterized client-server applications. To our
knowledge, this is the first distributed dialect of
Scheme (or a related language) that addresses
lightweight communication abstractions for higher-order
objects.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Programming Languages and
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
keywords = "experimentation; languages",
subject = "{\bf D.1.3}: Software, PROGRAMMING TECHNIQUES,
Concurrent Programming, Distributed programming. {\bf
D.3.2}: Software, PROGRAMMING LANGUAGES, Language
Classifications, Applicative languages. {\bf D.3.2}:
Software, PROGRAMMING LANGUAGES, Language
Classifications, Extensible languages. {\bf D.3.3}:
Software, PROGRAMMING LANGUAGES, Language Constructs
and Features, Concurrent programming structures. {\bf
D.3.2}: Software, PROGRAMMING LANGUAGES, Language
Classifications, SCHEME.",
author = "C.-Y. Chang and J.-P. Sheu",
title = "Compile-time scheduling of multithread with data
localities on multiple vector processors",
journal = j-CPE,
volume = "7",
number = "5",
pages = "349--369",
month = aug,
year = "1995",
ISSN = "1040-3108",
ISSN-L = "1040-3108",
bibdate = "Tue Sep 7 05:40:19 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cpe.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Concurrency, practice and experience",
author = "C.-Y. Chang and J.-P. Sheu",
title = "Compile-time scheduling of multithread with data
localities on multiple vector processors",
journal = j-CPE,
volume = "7",
number = "5",
pages = "349--369",
month = aug,
year = "1995",
ISSN = "1040-3108",
ISSN-L = "1040-3108",
bibdate = "Tue Sep 7 05:40:19 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Concurrency, practice and experience",
author = "Yong-Kim Chong and Kai Hwang",
title = "Performance Analysis of Four Memory Consistency Models
for Multithreaded Multiprocessors",
journal = j-IEEE-TRANS-PAR-DIST-SYS,
volume = "6",
number = "10",
pages = "1085--1099",
month = oct,
year = "1995",
DOI = "https://doi.org/10.1109/71.473517",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Fri Nov 6 12:31:15 MST 1998",
bibsource = "Compendex database;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.computer.org/tpds/td1995/l1085abs.htm",
acknowledgement = ack-nhfb,
affiliation = "Nanyang Technological Univ",
affiliationaddress = "Singapore, Singapore",
classification = "716.1; 722.1; 722.3; 722.4; 921.4; 922.1; C1160
(Combinatorial mathematics); C5440 (Multiprocessing
systems); C5470 (Performance evaluation and testing)",
corpsource = "Sch. of Electr. and Electron. Eng., Nanyang Technol.
Univ., Singapore",
fjournal = "IEEE Transactions on Parallel and Distributed
Systems",
journal-URL = "http://www.computer.org/tpds/archives.htm",
journalabr = "IEEE Trans Parallel Distrib Syst",
keywords = "attributes; Bandwidth; Buffer storage; cache
interferences; Computer networks; Computer selection
and evaluation; Computer simulation; Context switching;
Data communication systems; Data storage equipment;
Distributed shared memory; distributed shared memory
models; embedded Markov chains; evaluation; Latency
hiding techniques; Markov processes; memory consistency
models; Memory consistency models; memory event
reordering; multiprocessing systems; Multiprocessing
systems; multithreaded multiprocessors; Multithreaded
multiprocessors; performance; Performance; performance
analysis; Performance evaluation; Petri net models;
Petri nets; Processors; rate; scalable multiprocessors;
Scalable multiprocessors; stochastic timed Petri nets;
Stochastic timed Petri nets; synchronisation;
synchronization; Synchronization; Telecommunication
traffic; write buffers",
treatment = "A Application; P Practical",
author = "Nikos Chrisochoides",
title = "Multithreaded model for dynamic load balancing
parallel adaptive {PDE} computations",
type = "Technical report",
number = "CTC95, TR221",
institution = "Cornell Theory Center, Cornell University",
address = "Ithaca, NY, USA",
pages = "23",
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
alttitle = "Multi-threaded model for dynamic load balancing
parallel adaptive PDE computations",
author = "Nikos Chrisochoides",
title = "Multithreaded model for dynamic load balancing
parallel adaptive {PDE} computations",
type = "{NASA} contractor report 198244; {ICASE} report
95-83",
institution = "Institute for Computer Applications in Science and
Engineering NASA Langley Research Center",
address = "Hampton, VA, USA",
pages = "i + 23 + i",
month = nov,
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "To appear in Applied Numerical Mathematics Journal.",
abstract = "We present a multithreaded model for the dynamic
load-balancing of numerical, adaptive computations
required for the solution of Partial Differential
Equations (PDEs) on multiprocessors. Multithreading is
used as a means of exploring concurrency at the
processor level in order to tolerate synchronization
costs inherent to traditional (non-threaded) parallel
adaptive PDE solvers. Our preliminary analysis for
parallel, adaptive PDE solvers indicates that
multithreading can be used as a mechanism to mask
overheads required for the dynamic balancing of
processor workloads with computations required for the
actual numerical solution of the PDEs. Also,
multithreading can simplify the implementation of
dynamic load-balancing algorithms, a task that is very
difficult for traditional data parallel adaptive PDE
computations. Unfortunately, multithreading does not
always simplify program complexity, often makes code
re-usability difficult, and increases software
complexity.",
acknowledgement = ack-nhfb,
annote = "Supported in part by an Alex Nason Prize Award.
Supported in part by the NSF, supplemented by ARPA.
Supported in part by the National Aeronautics and Space
Administration.",
keywords = "Differential equations, Partial; Parallel programming
(Computer science); Synchronization; Threads (Computer
programs)",
author = "S. R. Coorg",
title = "Partitioning Non-Strict Functional Languages for
Multi-Threaded Code Generation",
journal = j-LECT-NOTES-COMP-SCI,
volume = "983",
pages = "82--??",
year = "1995",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat May 11 13:45:32 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Ravindra Divekar",
title = "The impact of multithreading on the performance of
superscalar processors",
type = "Thesis ({M.A.})",
number = "2117",
school = "State University of New York at Binghamton, Thomas J.
Watson School of Engineering and Applied Science",
address = "Binghamton, NY, USA",
pages = "vi + 73",
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "Master's theses / State University of New York at
Binghamton",
acknowledgement = ack-nhfb,
keywords = "Operating systems (Computers)",
author = "M. N. Dorojevets and V. G. Oklobdzija",
title = "Multithreaded Decoupled Architecture",
journal = j-INT-J-HIGH-SPEED-COMPUTING,
volume = "7",
number = "3",
pages = "465--??",
year = "1995",
ISSN = "0129-0533",
bibdate = "Mon Feb 25 11:19:23 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
OCLC Article1st database",
acknowledgement = ack-nhfb,
fjournal = "International Journal of High Speed Computing
(IJHSC)",
author = "Doron Drusinsky",
title = "Visually Designing Embedded-Systems Applications",
journal = j-DDJ,
volume = "20",
number = "6",
pages = "62, 64, 66, 68, 104--106",
month = jun,
year = "1995",
ISSN = "1044-789X",
bibdate = "Thu Jan 9 09:35:43 MST 1997",
bibsource = "Compendex database;
UnCover database",
abstract = "Doron describes how design tools that incorporate
object-oriented inheritance and extended state diagrams
(the visual counterpart of finite state machines) can
be used to build control systems.",
acknowledgement = ack-nhfb,
affiliation = "R-Active Concepts and Co-Active Concepts, Ltd",
classification = "721.1; 722.4; 723.1; 723.1.1; 723.2; 723.5; C5140
(Firmware); C6110J (Object-oriented programming);
C6110P (Parallel programming); C6140D (High level
languages)",
fjournal = "Dr. Dobb's Journal of Software Tools",
journalabr = "Dr Dobb's J Software Tools Prof Program",
keywords = "C; C (programming language); C++ listing; Codes
(SYMBOLS); Computer aided software engineering;
Computer software; Computer systems; Concurrency;
Digital answering machine; Embedded systems;
Embedded-systems application; ESD; Extended state
diagram; Extended state diagrams; Finite automata;
Finite state diagram; Firmware; Hierarchy; Inheritance;
Interactive computer systems; Microcode;
Multithreading; Object oriented programming;
Operating-system-like routine; Reactive system; Real
time system; State diagram; Synchronization; Systems
analysis; Visual synchronization; Visually designing",
pagecount = "4",
thesaurus = "C language; C listings; Firmware; Object-oriented
programming; Real-time systems",
author = "Pradeep Dubey",
title = "Single-program speculative multithreading ({SPSM})
architecture: compiler-assisted fine-grained
multithreading",
type = "Research report",
number = "RC 19928 (88233)",
institution = "IBM T. J. Watson Research Center",
address = "Yorktown Heights, NY, USA",
pages = "25",
day = "6",
month = feb,
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Recent limit studies on instruction-level parallel
processing, based on non-numeric applications, have
reported significant performance gains from speculative
execution of multiple control flows. This paper
describes a new single-program speculative
multithreading (SPSM) architecture, which can be viewed
as an extension of any existing single-thread
architecture. It enables speculative fetch, decode, and
execution from multiple program locations
simultaneously. Instruction threads are generated at
compile-time using control dependence analysis.
Inter-thread data dependences are also analyzed at
compile-time. However, resource binding of instructions
is performed only at run time, to offer binary
compatibility across different implementations. New
thread generation algorithms, being prototyped in a
version of the TOBEY compiler, are also described. The
SPSM architecture includes novel fork/suspend
instructions which are used to identify independent
instruction threads, and also to specify compile-time
control flow speculations associated with inter-thread
dependences.",
acknowledgement = ack-nhfb,
keywords = "Computer architecture",
author = "Jim Dugger",
title = "Multithreading in {C++}",
journal = j-CCCUJ,
volume = "13",
number = "11",
pages = "23--??",
month = nov,
year = "1995",
ISSN = "1075-2838",
bibdate = "Fri Aug 30 16:52:23 MDT 1996",
bibsource = "http://www.cuj.com/cbklist.htm;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "N. Elmasri and H. H. J. Hum and G. R. Gao",
title = "The Threaded Communication Library: Preliminary
Experiences on a Multiprocessor with Dual-Processor
Nodes",
crossref = "ACM:1995:CPI",
pages = "195--199",
year = "1995",
bibdate = "Mon Aug 26 10:38:41 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "John English",
title = "Multithreading in {C++}",
journal = j-SIGPLAN,
volume = "30",
number = "4",
pages = "21--28",
month = apr,
year = "1995",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:17:03 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Thomas Fahringer and Matthew Haines and Piyush
Mehrotra",
title = "On the utility of threads for data parallel
programming",
number = "198155",
publisher = pub-NTIS,
address = pub-NTIS:adr,
pages = "??",
year = "1995",
LCCN = "NAS 1.26:198155 Govt Pubs",
bibdate = "Fri May 10 12:18:17 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Shipping list number 96-0037-M",
series = "NASA contractor report",
acknowledgement = ack-nhfb,
keywords = "computation; interprocessor communication; parallel
programming; particle in cell technique; relaxation
method (mathematics)",
author = "T. Fahringer and M. Haines and P. Mehrotra",
title = "On the Utility of Threads for Data Parallel
Programming",
crossref = "ACM:1995:CPI",
pages = "51--59",
year = "1995",
bibdate = "Mon Aug 26 10:38:41 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "John Field and G. Ramalingam and Frank Tip",
title = "Parametric program slicing",
crossref = "ACM:1995:CRP",
pages = "379--392",
year = "1995",
bibdate = "Mon May 3 12:52:30 MDT 1999",
bibsource = "http://www.acm.org/pubs/toc/;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/199448/p379-field/",
abstract = "Program slicing is a technique for isolating
computational threads in programs. In this paper, we
show how to mechanically extract a family of practical
algorithms for computing slices directly from semantic
specifications. These algorithms are based on combining
the notion of {\em dynamic dependence tracking\/} in
term rewriting systems with a program representation
whose behavior is defined via an equational logic. Our
approach is distinguished by the fact that changes to
the behavior of the slicing algorithm can be
accomplished through simple changes in rewriting rules
that define the semantics of the program
representation. Thus, e.g., different notions of
dependence may be specified, properties of
language-specific datatypes can be exploited, and
various time, space, and precision tradeoffs may be
made. This flexibility enables us to generalize the
traditional notions of static and dynamic slices to
that of a {\em constrained\/} slice, where any subset
of the inputs of a program may be supplied.",
acknowledgement = ack-nhfb,
keywords = "algorithms; languages",
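subject = "{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS
OF PROGRAMS, Studies of Program Constructs, Program and
recursion schemes. {\bf F.3.3} Theory of Computation,
LOGICS AND MEANINGS OF PROGRAMS, Studies of Program
Constructs, Functional constructs. {\bf F.3.2} Theory
of Computation, LOGICS AND MEANINGS OF PROGRAMS,
Semantics of Programming Languages. {\bf F.3.1} Theory
of Computation, LOGICS AND MEANINGS OF PROGRAMS,
Specifying and Verifying and Reasoning about Programs,
Specification techniques. {\bf F.4.2} Theory of
Computation, MATHEMATICAL LOGIC AND FORMAL LANGUAGES,
Grammars and Other Rewriting Systems. {\bf D.3.2}
Software, PROGRAMMING LANGUAGES, Language
Classifications, C.",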
author = "Jonathan Finger",
title = "Lightweight Tasks in {C}",
journal = j-DDJ,
volume = "20",
number = "5",
pages = "48, 50, 102",
month = may,
year = "1995",
ISSN = "1044-789X",
bibdate = "Tue Sep 03 09:16:50 1996",
bibsource = "Compendex database;
UnCover database",
abstract = "While most modern operating systems allow multiple
threads within a process, earlier-generation systems do
not. Jonathan presents a multithreading package that
allows for cooperatively multitasked threads within a
single process for operating systems that do not
explicitly support threads.",
acknowledgement = ack-nhfb,
classification = "722.4; 723.1; 723.1.1; C6110B (Software engineering
techniques); C6150J (Operating systems)",
fjournal = "Dr. Dobb's Journal of Software Tools",
journalabr = "Dr Dobb's J Software Tools Prof Program",
keywords = "C; C (programming language); Codes (SYMBOLS); Computer
operating systems; Context switch; Cooperative task
switching; Cooperatively multitasked threads; DOS; High
level language; Lightweight tasker; Lightweight tasks;
Microsoft compiler; Minicomputer platform; MIX
Software; Modern operating systems; Multi-C package;
Multiple processes; Multiprocessing systems;
Multiprogramming; Multitasking system; Multithreading
code; Multithreading package; Multiuser application;
Multiuser mailing list management system; PC/DOS
system; Preemptive task switching; Program compilers;
Software engineering; Tenberry Software; Threads;
Watcom compiler",
pagecount = "2",
thesaurus = "C listings; Multiprogramming; Software portability",
author = "Stuart Fiske and William J. Dally",
title = "Thread prioritization: a thread scheduling mechanism
for multiple-context parallel processors",
journal = j-FUT-GEN-COMP-SYS,
volume = "11",
number = "6",
pages = "503--518",
month = oct,
year = "1995",
ISSN = "0167-739X (print), 1872-7115 (electronic)",
ISSN-L = "0167-739X",
bibdate = "Sat Jan 10 12:00:22 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Future Generation Computer Systems",
journal-URL = "http://www.sciencedirect.com/science/journal/0167739X",
remark = "High-Performance Computer Architecture.",
author = "Dan Ford",
title = "Event-Driven Threads In {C++}",
journal = j-DDJ,
volume = "20",
number = "6",
pages = "48--50, 52, 54, 98, 100, 102",
month = jun,
year = "1995",
ISSN = "1044-789X",
bibdate = "Thu Jan 9 09:35:43 MST 1997",
bibsource = "Compendex database;
UnCover database",
abstract = "Dan presents a powerful, multithreaded architecture
that can be used by almost any application. Implemented
in C++, this class library lets you quickly create and
control threads.",
acknowledgement = ack-nhfb,
affiliation = "Hewlett--Packard",
classification = "721.1; 722.4; 723.1; 723.1.1; 723.2; 723.5; C6110J
(Object-oriented programming); C6110P (Parallel
programming); C6140D (High level languages)",
fjournal = "Dr. Dobb's Journal of Software Tools",
journalabr = "Dr Dobb's J Software Tools Prof Program",
keywords = "C; C (programming language); C++; Computer aided
software engineering; Computer architecture; Computer
simulation; Data structures; Equivalence classes; Event
driven threads; Hierarchical systems; Interthread
communication; Message driven thread; Multithreaded;
Multithreaded applications; Multithreading; Object
oriented programming; Object oriented programming
application; Object-oriented infrastructure; Parallel
processing; Parallelism; Synchronization;
Synchronization strategies",
pagecount = "5",
thesaurus = "C language; C listings; Object-oriented programming;
Parallel programming",
author = "Dan Ford",
title = "Event-Driven Threads In {C++}",
journal = j-DDJ,
volume = "20",
number = "6",
pages = "48--50, 52, 54, 98, 100, 102",
month = jun,
year = "1995",
ISSN = "1044-789X",
bibdate = "Thu Jan 9 09:35:43 MST 1997",
bibsource = "Compendex database;
UnCover database",
abstract = "Dan presents a powerful, multithreaded architecture
that can be used by almost any application. Implemented
in C++, this class library lets you quickly create and
control threads.",
acknowledgement = ack-nhfb,
affiliation = "Hewlett--Packard",
classification = "721.1; 722.4; 723.1; 723.1.1; 723.2; 723.5; C6110J
(Object-oriented programming); C6110P (Parallel
programming); C6140D (High level languages)",
fjournal = "Dr. Dobb's Journal of Software Tools",
journalabr = "Dr Dobb's J Software Tools Prof Program",
keywords = "C; C (programming language); C++; Computer aided
software engineering; Computer architecture; Computer
simulation; Data structures; Equivalence classes; Event
driven threads; Hierarchical systems; Interthread
communication; Message driven thread; Multithreaded;
Multithreaded applications; Multithreading; Object
oriented programming; Object oriented programming
application; Object-oriented infrastructure; Parallel
processing; Parallelism; Synchronization;
Synchronization strategies",
pagecount = "5",
thesaurus = "C language; C listings; Object-oriented programming;
Parallel programming",
author = "Guang R. Gao and Lubomir Bic and Jean-Luc Gaudiot",
title = "Advanced topics in dataflow computing and
multithreading",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "x + 450",
year = "1995",
ISBN = "0-8186-6541-6 (hardcover), 0-8186-6540-8 (paperback),
0-8186-6542-4",
ISBN-13 = "978-0-8186-6541-7 (hardcover), 978-0-8186-6540-0
(paperback), 978-0-8186-6542-4",
LCCN = "QA76.9.A73 A356 1995",
bibdate = "Sat Apr 20 11:22:41 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "computer architecture; data structures (computer
science); parallel processing (electronic computers)",
author = "Bob Gerber",
title = "{Informix} Online {XPS}",
journal = j-SIGMOD,
volume = "24",
number = "2",
pages = "463--463",
month = may,
year = "1995",
ISSN = "0163-5808 (print), 1943-5835 (electronic)",
ISSN-L = "0163-5808",
bibdate = "Mon Jan 12 08:45:52 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C6110P (Parallel programming); C6150N (Distributed
systems software); C6160B (Distributed databases)",
fjournal = "ACM SIGMOD Record",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J689",
keywords = "Informix Dynamic Scalable Architecture; Informix
Extended Parallel Server; Informix Online XPS; Large
SMP systems; Light access methods; Linear performance
speedups; Loosely coupled environments; Massively
parallel clusters; Online database servers; Online/DSA
servers; Open systems spectrum; Parallel database
systems; Parallel resource management; Pipelined hash
partitioned operators; SMP based high performance
parallel data query; Table partitioning; Uniprocessor
systems; XPS; XPS multithreaded process groups",
thesaurus = "Distributed databases; File servers; Parallel
programming; Query processing",
xxcrossref = "Anonymous:1995:ASI",
author = "Milind Girkar and Constantine D. Polychronopoulos",
title = "Extracting Task-Level Parallelism",
journal = j-TOPLAS,
volume = "17",
number = "4",
pages = "600--634",
month = jul,
year = "1995",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Fri Jan 5 07:58:42 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/210189.html",
abstract = "Automatic detection of {\em task-level parallelism\/}
(also referred to as functional, DAG, unstructured, or
thread parallelism) at various levels of program
granularity is becoming increasingly important for
parallelizing and back-end compilers. Parallelizing
compilers detect iteration-level or coarser granularity
parallelism which is suitable for parallel computers;
detection of parallelism at the statement-or
operation-level is essential for most modern
microprocessors, including superscalar and VLIW
architectures. In this article we study the problem of
detecting, expressing, and optimizing task-level
parallelism, where ``task'' refers to a program
statement of arbitrary granularity. Optimizing the
amount of functional parallelism (by allowing
synchronization between arbitrary nodes) in sequential
programs requires the notion of {\em precedence\/} in
terms of paths in graphs which incorporate control and
data dependences. Precedences have been defined before
in a different context; however, the definition was
dependent on the ideas of parallel execution and time.
We show that the problem of determining precedences
statically is NP-complete. Determining precedence
relationships is useful in finding the essential data
dependences. We show that there exists a unique minimum
set of essential data dependences; finding this minimum
set is NP-hard and NP-easy. We also propose a heuristic
algorithm for finding the set of essential data
dependences. Static analysis of a program in the
Perfect Benchmarks was done, and we present some
experimental results.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Programming Languages and
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
keywords = "algorithms; experimentation; languages; theory",
subject = "{\bf D.3.4}: Software, PROGRAMMING LANGUAGES,
Processors, Optimization. {\bf D.3.4}: Software,
PROGRAMMING LANGUAGES, Processors, Compilers. {\bf
F.1.3}: Theory of Computation, COMPUTATION BY ABSTRACT
DEVICES, Complexity Classes, Reducibility and
completeness. {\bf D.3.4}: Software, PROGRAMMING
LANGUAGES, Processors, Code generation.",
author = "B. Goossens and D. T. Vu",
title = "Further Pipelining and Multithreading to Improve
{RISC} Processor Speed. {A} Proposed Architecture and
Simulation Results",
journal = j-LECT-NOTES-COMP-SCI,
volume = "964",
pages = "326--??",
year = "1995",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat May 11 13:45:32 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Manu Gulati",
title = "Multithreading on a superscalar microprocessor",
type = "Thesis ({M.S., Engineering})",
school = "University of California, Irvine",
address = "Irvine, CA, USA",
pages = "x + 102",
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Matthew Haines and Piyush Mehrotra and David Cronk",
title = "Ropes, support for collective operations among
distributed threads",
number = "198157",
publisher = pub-NTIS,
address = pub-NTIS:adr,
pages = "??",
year = "1995",
LCCN = "NAS 1.26:198157 Govt Pubs",
bibdate = "Fri May 10 12:18:17 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Shipping list number 96-0037-M",
series = "NASA contractor report",
acknowledgement = ack-nhfb,
keywords = "computer system design; distributed processing;
interprocessor communication; memory (computers);
numerical control; parallel programming; threads",
author = "E. Douglas Jensen",
title = "Distributed real-time operating systems",
journal = j-DDJ,
volume = "20",
number = "2",
pages = "32--34, 36, 38",
month = feb,
year = "1995",
ISSN = "1044-789X",
bibdate = "Tue Sep 10 08:45:36 MDT 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
acknowledgement = ack-nhfb,
classification = "C6150N (Distributed systems software)",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "Distributed objects; Distributed operating systems;
Operating systems; Real-time computing; Real-time
operating systems; Real-time paradigm; Threads",
thesaurus = "Network operating systems; Real-time systems",
author = "Krishna M. Kavi and A. R. Hurson and Phenil Patadia
and Elizabeth Abraham and Ponnarasu Shanmugam",
title = "Design of cache memories for multi-threaded dataflow
architecture",
journal = j-COMP-ARCH-NEWS,
volume = "23",
number = "2",
pages = "253--264",
month = may,
year = "1995",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:47 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "S.-I. Kawamoto and T. Ito",
title = "Multi-threaded {PaiLisp} with Granularity Adaptive
Parallel Execution",
journal = j-LECT-NOTES-COMP-SCI,
volume = "907",
pages = "94--??",
year = "1995",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat May 11 13:45:32 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Steve Kleiman and Joe Eykholt",
title = "Interrupts as threads",
journal = j-OPER-SYS-REV,
volume = "29",
number = "2",
pages = "21--26",
month = apr,
year = "1995",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:41 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Steve Kleiman and Devang Shah and Bart Smaalders",
title = "Programming With Threads",
publisher = pub-SUNSOFT,
address = pub-SUNSOFT:adr,
pages = "xxviii + 534",
year = "1995",
ISBN = "0-13-172389-8",
ISBN-13 = "978-0-13-172389-4",
LCCN = "QA76.58.K59 1996",
bibdate = "Wed Dec 09 12:51:22 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
price = "US\$48.00",
URL = "http://www.amazon.com/exec/obidos/ISBN=0131723898/sunworldonlineA/002-4892305-5599452",
acknowledgement = ack-nhfb,
author = "Richard B. Lam",
title = "Cross-platform communication classes",
journal = j-DDJ,
volume = "20",
number = "3",
pages = "20, 22, 24, 26",
month = mar,
year = "1995",
ISSN = "1044-789X",
bibdate = "Tue Sep 10 08:45:36 MDT 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
UnCover database",
abstract = "Richard summarizes common techniques for interprocess
communication, presenting a library that implements
semaphores in a platform-independent manner to allow
signaling or controlling of shared resources between
processes and threads.",
acknowledgement = ack-nhfb,
classification = "C5620L (Local area networks); C6110J
(Object-oriented programming); C6140D (High level
languages); C6150N (Distributed systems software)",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "AIX; C++ libraries; Client/server computing; Cross
platform C++ libraries; Cross-platform communication
classes; Example library; Graphical user interfaces;
Interprocess communications; OS/2; Semaphores; Shared
resources; Windows NT",
thesaurus = "C language; Client-server systems; Object-oriented
languages; Object-oriented programming; Resource
allocation; Software libraries",
author = "J.-M. Larchev{\^e}que",
title = "Optimal Incremental Parsing",
journal = j-TOPLAS,
volume = "17",
number = "1",
pages = "1--15",
month = jan,
year = "1995",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Fri Jan 5 07:58:42 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/200996.html",
abstract = "This communication sets the problem of incremental
parsing in the context of a complete incremental
compiling system. It turns out that, according to the
incrementality paradigm of the attribute evaluator and
data-flow analyzer to be used, two definitions of
optimal incrementality in a parser are possible.
Algorithms for achieving both forms of optimality are
given, both of them based on ordinary LALR(1) parse
tables. Optimality and correctness proofs, which are
merely outlined in this communication, are made
intuitive thanks to the concept of a {\em well-formed
list of threaded trees}, a natural extension of the
concept of {\em threaded tree\/} found in earlier works
on incremental parsing.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Programming Languages and
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
keywords = "algorithms; performance; theory",
subject = "{\bf D.3.4}: Software, PROGRAMMING LANGUAGES,
Processors, Parsing. {\bf D.2.6}: Software, SOFTWARE
ENGINEERING, Programming Environments, Interactive.
{\bf D.3.4}: Software, PROGRAMMING LANGUAGES,
Processors, Compilers. {\bf E.1}: Data, DATA
author = "C. Lenatti",
title = "{Rethinking in Parallel: Multiprocessing is on the
rise, despite a dearth of tools to help create
multithreaded applications}",
volume = "12",
number = "8",
pages = "57--??",
year = "1995",
ISSN = "1072-4044",
bibdate = "Fri Jan 26 17:24:01 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "UnixWorld's Open Computing",
author = "Ville Lepp{\"a}nen",
title = "Performance of work-optimal {PRAM} simulation
algorithms on coated meshes",
journal = j-COMP-J,
volume = "38",
number = "10",
pages = "801--810",
month = "????",
year = "1995",
ISSN = "0010-4620 (print), 1460-2067 (electronic)",
ISSN-L = "0010-4620",
bibdate = "Wed Jul 21 09:54:40 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www3.oup.co.uk/computer_journal/Volume_38/Issue_10/Vol38_10.body.html#AbstractLeppanen",
acknowledgement = ack-nhfb,
author-1-adr = "Department of Computer Science, University of Turku,
Lemmink{\"a}isenkatu 14-18, Datacity, FIN-20520 Turku,
classcodes = "C5220P (Parallel architecture); C7430 (Computer
engineering); C5320G (Semiconductor storage); C6110P
(Parallel programming); C4240C (Computational
corpsource = "Dept. of Comput. Sci., Turku Univ., Finland",
email-1 = "Ville.Leppanen@cs.utu.fi",
fjournal = "The Computer Journal",
journal-URL = "http://comjnl.oxfordjournals.org/",
keywords = "architectures; coated meshes; combining queues method;
computational complexity; cost; greedy routing; mesh
connected routing machinery; multithreading level;
parallel; parallel algorithms; random-access storage;
routing steps; simulated PRAM processors; simulation;
sorting; synchronization wave; virtual leveled network
technique; virtual machines; work optimal PRAM
simulation algorithms",
treatment = "P Practical",
author = "Beng-Hong Lim and Ricardo Bianchini",
title = "Limits on the performance benefits of multithreading
and prefetching",
type = "Research report",
number = "RC 20238 (89547)",
institution = "IBM T. J. Watson Research Center",
address = "Yorktown Heights, NY, USA",
pages = "23",
day = "20",
month = oct,
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
annote = "Supported in part by ARPA. Supported in part by NSF
Experimental Systems. Supported in part by a NSF
Presidential Young Investigator Award",
keywords = "Cache memory; Fault-tolerant computing;
author = "Matias Loikkanen",
title = "A fine-grain multithreading superscalar architecture",
type = "Thesis ({M.S., Engineering})",
school = "University of California, Irvine",
address = "Irvine, CA, USA",
pages = "xi + 103",
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Howard J. (Howard Jason) Lu",
title = "Heterogeneous multithreaded computing",
type = "Thesis ({M. Eng.})",
school = "Massachusetts Institute of Technology, Department of
Electrical Engineering and Computer Science",
address = "Cambridge, MA, USA",
pages = "21",
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "O. C. Maquelin and H. H. J. Hum and G. R. Gao",
title = "Costs and Benefits of Multithreading with
Off-the-Shelf {RISC} Processors",
journal = j-LECT-NOTES-COMP-SCI,
volume = "966",
pages = "117--??",
year = "1995",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat May 11 13:45:32 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "T. A. Marsland and Yaoqing Gao and Francis Chi-Moon
Lau",
title = "A study of software multithreading in distributed
systems",
type = "Technical report",
number = "TR 95-23",
institution = "Dept. of Computing Science, University of Alberta",
address = "Edmonton, AB, Canada",
pages = "25",
year = "1995",
ISSN = "0316-4683",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "K. R. Mayes and S. Quick and B. C. Warboys",
title = "User-level threads on a general hardware interface",
journal = j-OPER-SYS-REV,
volume = "29",
number = "4",
pages = "57--62",
month = oct,
year = "1995",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:52 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "David Metz",
title = "Interface design and system impact analysis of a
message-handling processor for fine-grain
type = "Thesis ({M.S.})",
school = "Oregon State University",
address = "Corvallis, OR, USA",
pages = "63",
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Multiprocessors; Parallel processing (Electronic
computers)",
author = "Robert C. (Robert Chisolm) Miller",
title = "A type-checking preprocessor for {Cilk 2}, a
multithreaded {C} language",
type = "Thesis ({M. Eng.})",
school = "Massachusetts Institute of Technology, Department of
Electrical Engineering and Computer Science",
address = "Cambridge, MA, USA",
pages = "38",
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Simon W. Moore",
title = "Multithreaded processor design",
type = "Thesis ({Ph.D.})",
school = "University of Cambridge, Computer Laboratory",
address = "Cambridge, Cambridgeshire, UK",
pages = "xvi + 125",
month = feb,
year = "1995",
LCCN = "QA76.9.A73 M66 1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Available as Technical Report 358.",
abstract = "Multithreaded processors aim to improve upon both
control-flow and data-flow processor models by forming
some amalgam of the two. They combine sequential
behaviour from the control-flow model with concurrent
aspects from data-flow design. Some multithreaded
processor designs have added just a little concurrency
to control-flow or limited sequential execution to
data-flow. This thesis demonstrates that more
significant benefits may be obtained by a more radical
amalgamation of the two models. A data-driven
microthread model is proposed, where a microthread is a
short control-flow code sequence. To demonstrate the
efficiency of this model, a suitable multithreaded
processor, called Anaconda, is designed and evaluated.
Anaconda incorporates a scalable temporally predictable
memory tree structure with distributed virtual address
translation and memory protection. A temporally
predictable cached direct-mapped matching store is
provided to synchronise data to microthreads. Code is
prefetched into an instruction cache before execution
commences. Earliest-deadline-first or fixed-priority
scheduling is supported via a novel hardware priority
queue. Control-flow execution is performed by a
modified Alpha 21064 styled pipeline which assists
comparison with commercial processors.",
acknowledgement = ack-nhfb,
annote = "Supported in part by a studentship from the UK Science
and Engineering Research Council",
keywords = "Computer architecture",
author = "Shuichi Oikawa and Hideyuki Tokuda",
title = "Reflection of developing user-level real-time thread
packages",
journal = j-OPER-SYS-REV,
volume = "29",
number = "4",
pages = "63--76",
month = oct,
year = "1995",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:52 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Ernest N. Prabhakar",
title = "Implementing Distributed Objects",
journal = j-DDJ,
volume = "20",
number = "8",
pages = "80, 82, 84--85, 105--106",
month = aug,
year = "1995",
ISSN = "1044-789X",
bibdate = "Thu Jan 9 09:35:43 MST 1997",
bibsource = "Compendex database;
UnCover database",
abstract = "Ernest uses NeXT's PDO and Objective-C to implement a
simple client-server application that packages a legacy
application into an interoperable object and its
acknowledgement = ack-nhfb,
affiliation = "NextStep\slash OpenStep User Groups Int",
classification = "722.1; 722.2; 722.3; 722.4; 723.1; C5620L (Local
area networks); C6110J (Object-oriented programming);
C6110P (Parallel programming); C6140D (High level
fjournal = "Dr. Dobb's Journal of Software Tools",
journalabr = "Dr Dobb's J Software Tools Prof Program",
keywords = "Codes (symbols); Computer networks; Distributed
applications; Distributed computer systems; Distributed
objects; Interfaces (COMPUTER); Interoperable object;
Interoperable objects; Legacy application;
Multithreaded object; Network protocols; NeXT; Object
oriented programming; Objective-C; PDO; Portable
distributed objects; Program compilers; Simple client
server application; Software prototyping; Storage
allocation (computer); Table lookup",
pagecount = "4",
thesaurus = "C language; C listings; Client-server systems;
Object-oriented programming; Parallel programming",
author = "Shashi Prasad",
title = "{Windows NT} Threads --- a multithreaded application
may actually run slower on an {SMP} machine than on its
single-threaded equivalent. {Here}'s how to avoid that",
journal = j-BYTE,
volume = "20",
number = "11",
pages = "253--??",
month = nov,
year = "1995",
ISSN = "0360-5280 (print), 1082-7838 (electronic)",
ISSN-L = "0360-5280",
bibdate = "Mon Aug 19 08:30:25 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "BYTE Magazine",
author = "Shashi Prasad",
title = "Weaving a Thread --- {Solaris} and {Windows NT} bring
the power, speed, and efficiency of multithreading and
symmetric multiprocessing to the desktop",
journal = j-BYTE,
volume = "20",
number = "10",
pages = "173--??",
month = oct,
year = "1995",
ISSN = "0360-5280 (print), 1082-7838 (electronic)",
ISSN-L = "0360-5280",
bibdate = "Mon Aug 19 08:30:21 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "BYTE Magazine",
author = "David E. Reich",
title = "Designing high-powered {OS/2 Warp} applications: the
anatomy of multithreaded programs",
publisher = pub-WILEY,
address = pub-WILEY:adr,
pages = "xxxi + 336",
year = "1995",
ISBN = "0-471-11586-X (paperback)",
ISBN-13 = "978-0-471-11586-1 (paperback)",
LCCN = "QA76.76.O63R437 1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Application software; Microcomputers -- Operating
systems; Operating systems (Computers); OS/2 Warp",
author = "Ira Rodens",
title = "Examining {Symantec C++} 7.0",
journal = j-DDJ,
volume = "20",
number = "8",
pages = "86--89, 106--107",
month = aug,
year = "1995",
ISSN = "1044-789X",
bibdate = "Thu Jan 9 09:35:43 MST 1997",
bibsource = "Compendex database;
UnCover database",
abstract = "Among other features, this recent incarnation of
Symantec C++ sports a visual programming environment,
class and hierarchy editors, distributed build tools,
and support for templates, exceptions, and run-time
type identification. Compiler author Walter Bright adds
tips and techniques for optimizing C++ code.",
acknowledgement = ack-nhfb,
affiliation = "CompuServe",
classification = "722.2; 723.1; 723.1.1; 723.5; C6110J
(Object-oriented programming); C6110V (Visual
programming); C6115 (Programming support); C6130B
(Graphics techniques); C6150G (Diagnostic, testing,
debugging and evaluating systems); C6180G (Graphical
user interfaces)",
fjournal = "Dr. Dobb's Journal of Software Tools",
journalabr = "Dr Dobb's J Software Tools Prof Program",
keywords = "32-Bit multithreaded linker; Benchmarking; Browsers;
Build tasks; C (programming language); C++ language;
Codes (SYMBOLS); Computer programming; Distributed
build tools; DOS; Exceptions; Express Agents; File
editors; Graphical user interfaces; Hierarchy editors;
LAN; Linker; Multiscope debugger; Program compilers;
Program debugging; Run time type identification; Run
time type identification programming environment;
Software engineering; Symantec C++ 7; Templates;
Upgraded Microsoft Foundation Classes; Visual
programming; Visual programming environment; Visual
tools; Windows 95 resources",
thesaurus = "Graphical user interfaces; Object-oriented
programming; Program debugging; Software reviews;
Software tools; Visual programming",
author = "John Rodley",
title = "Thread Programming In {UnixWare} 2.0",
journal = j-DDJ,
volume = "20",
number = "6",
pages = "56, 58--61, 102, 104",
month = jun,
year = "1995",
ISSN = "1044-789X",
bibdate = "Thu Jan 9 09:35:43 MST 1997",
bibsource = "Compendex database;
UnCover database",
abstract = "With the advent of UnixWare 2.0, threads have made
their way to the UNIX desktop. John describes how
threads are implemented and how you can take advantage
of them.",
acknowledgement = ack-nhfb,
classification = "722.2; 722.4; 723.1; 723.2; 723.5; C6110P (Parallel
programming); C6150J (Operating systems); C6150N
(Distributed systems software)",
fjournal = "Dr. Dobb's Journal of Software Tools",
journalabr = "Dr Dobb's J Software Tools Prof Program",
keywords = "Computer aided software engineering; Computer
programming; Computer simulation; Concurrency
programming; Fork; Lightweight processes;
Multiprocessing; Multiprocessing systems;
Multithreading; Object oriented programming; P1003.1c;
Parallel programming; POSIX Portable Operating Systems
Standard; Real time systems; Signal processing; Thread
programming; Thread specification; UNIX; UnixWare 2.0;
User interfaces",
pagecount = "5",
thesaurus = "Multiprocessing programs; Parallel programming; Unix",
author = "Anne Rogers and Martin C. Carlisle and John H. Reppy
and L. J. Hendren",
title = "Supporting Dynamic Data Structures on
Distributed-Memory Machines",
journal = j-TOPLAS,
volume = "17",
number = "2",
pages = "233--263",
month = mar,
year = "1995",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Fri Jan 5 07:58:42 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/201065.html",
abstract = "Compiling for distributed-memory machines has been a
very active research area in recent years. Much of this
work has concentrated on programs that use arrays as
their primary data structures. To date, little work has
been done to address the problem of supporting programs
that use pointer-based dynamic data structures. The
techniques developed for supporting SPMD execution of
array-based programs rely on the fact that arrays are
statically defined and directly addressable. Recursive
data structures do not have these properties, so new
techniques must be developed. In this article, we
describe an execution model for supporting programs
that use pointer-based dynamic data structures. This
model uses a simple mechanism for migrating a thread of
control based on the layout of heap-allocated data and
introduces parallelism using a technique based on
futures and lazy task creation. We intend to exploit
this execution model using compiler analyses and
automatic parallelization techniques. We have
implemented a prototype system, which we call {\em
Olden}, that runs on the Intel iPSC/860 and the
Thinking Machines CM-5. We discuss our implementation
and report on experiments with five benchmarks.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Programming Languages and
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
keywords = "experimentation; languages; measurement; performance",
subject = "{\bf D.3.4}: Software, PROGRAMMING LANGUAGES,
Processors, Run-time environments. {\bf D.1.3}:
Programming, Parallel programming. {\bf D.3.4}:
Software, PROGRAMMING LANGUAGES, Processors, Compilers.
{\bf D.3.3}: Software, PROGRAMMING LANGUAGES, Language
Constructs and Features, Data types and structures.
{\bf D.3.3}: Software, PROGRAMMING LANGUAGES, Language
Constructs and Features, Dynamic storage management.",
author = "Lucas J. Roh",
title = "Code generations, evaluations, and optimizations in
multithreaded executions",
type = "Thesis ({Ph.D.})",
school = inst-CSU,
address = inst-CSU:adr,
pages = "ix + 154",
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Code generators; Computer architecture; Parallel
processing (Electronic computers)",
author = "Klaus E. Schauser and David E. Culler and Seth C.
Goldstein",
title = "Separation constraint partitioning: a new algorithm
for partitioning non-strict programs into sequential
threads",
crossref = "ACM:1995:CRP",
pages = "259--271",
year = "1995",
bibdate = "Mon May 3 12:52:30 MDT 1999",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/199448/p259-schauser/",
abstract = "In this paper we present substantially improved thread
partitioning algorithms for modern implicitly parallel
languages. We present a new block partitioning
algorithm, {\em separation constraint partitioning\/},
which is both more powerful and more flexible than
previous algorithms. Our algorithm is guaranteed to
derive maximal threads. We present a theoretical
framework for proving the correctness of our
partitioning approach, and we show how separation
constraint partitioning makes interprocedural
partitioning viable. We have implemented the
partitioning algorithms in an Id90 compiler for
workstations and parallel machines. Using this
experimental platform, we quantify the effectiveness of
different partitioning schemes on whole applications.",
acknowledgement = ack-nhfb,
keywords = "algorithms; experimentation; languages; theory;
subject = "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language
Classifications, Parallel C. {\bf D.3.4} Software,
PROGRAMMING LANGUAGES, Processors, Compilers. {\bf
F.2.2} Theory of Computation, ANALYSIS OF ALGORITHMS
AND PROBLEM COMPLEXITY, Nonnumerical Algorithms and
Problems, Computations on discrete structures. {\bf
F.3.3} Theory of Computation, LOGICS AND MEANINGS OF
PROGRAMS, Studies of Program Constructs.",
author = "Munira Shahnaz",
title = "Design of a multithreaded data cache for a hyperscalar
type = "Thesis ({M.S.})",
school = "Department of Electrical Engineering, Texas A\&M
University",
address = "College Station, TX, USA",
pages = "xi + 80",
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Major electrical engineering",
author = "Bhanu Shankar",
title = "The spectrum of thread implementations on hybrid
multithreaded architectures",
type = "Thesis ({Ph.D.})",
school = inst-CSU,
address = inst-CSU:adr,
pages = "xi + 176",
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Computer architecture; Parallel processing (Electronic
computers)",
author = "Christopher Small and Margo Seltzer",
title = "Scheduler activations on {BSD}: sharing thread
management between kernel and application",
type = "Technical Report",
number = "31-95",
institution = "Center for Research in Computing Technology, Harvard
University",
address = "Cambridge, MA, USA",
pages = "12",
year = "1995",
bibdate = "Tue Sep 17 07:11:15 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Ellen Spertus and William J. Dally",
title = "Evaluating the locality benefits of active messages",
journal = j-SIGPLAN,
volume = "30",
number = "8",
pages = "189--198",
month = aug,
year = "1995",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:17:08 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "A major challenge in fine-grained computing is
achieving locality without excessive scheduling
overhead. We built two J-Machine implementations of a
fine-grained programming model, the Berkeley Threaded
Abstract Machine. One implementation takes an active
messages approach, maintaining a scheduling hierarchy
in software in order to improve data cache performance.
Another approach relies on the J-Machine's message
queues and fast task switch, lowering the control costs
at the expense of data locality. Our analysis measures
the costs and benefits of each approach, for a variety
of programs and cache configurations. The active
messages implementation is strongest when miss
penalties are high and for the finest-grained programs.
The hardware-buffered implementation is strongest in
direct-mapped caches, where it achieves substantially
better instruction cache performance.",
acknowledgement = ack-nhfb,
affiliation = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
classification = "C6110P (Parallel programming); C6120 (File
organisation); C6150C (Compilers, interpreters and
other processors); C6150N (Distributed systems
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "Active messages; Benefits; Berkeley Threaded Abstract
Machine; Cache configuration; Costs; Data cache
performance; Data locality; Direct-mapped caches; Fast
task switch; Fine-grained computing; Fine-grained
programming model; Hardware-buffered; Instruction cache
performance; J-Machine; Locality benefits; Message
queues; Miss penalties; Scheduling hierarchy;
Scheduling overhead",
thesaurus = "Cache storage; Cost-benefit analysis; Parallel
programming; Program compilers; Scheduling; Software
performance evaluation",
author = "Murali V. Srinivasan",
title = "A Methodology for Multithreaded {X} Client
Development",
journal = j-X-RESOURCE,
volume = "13",
number = "1",
pages = "181--181",
month = jan,
year = "1995",
ISBN = "1-56592-121-6",
ISBN-13 = "978-1-56592-121-4",
ISSN = "1058-5591",
bibdate = "Fri Mar 31 06:55:49 1995",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "The X Resource",
author = "B. Steensgaard and E. Jul",
title = "Object and native code thread mobility among
heterogeneous computers (includes sources)",
journal = j-OPER-SYS-REV,
volume = "29",
number = "5",
pages = "68--77",
month = dec,
year = "1995",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:55 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Richard Stuckey",
title = "A fully conformant implementation of {ECMA-162}",
journal = j-ADA-USER,
volume = "16",
number = "2",
pages = "83--94",
month = jun,
year = "1995",
ISSN = "0268-652X",
bibdate = "Mon Sep 8 18:43:50 MDT 1997",
bibsource = "Compendex database;
abstract = "ICL has developed a portable implementation of the Ada
interfaces to PCTE as specified by ECMA-162. The
interfaces map the functionality required onto that
provided by the C interfaces to PCTE as specified by
ECMA-158. The process of implementing the interfaces
revealed a number of errors in the ECMA PCTE standards,
such as errors in ECMA-162 concerning the mapping of
ECMA-149 onto Ada, errors in ECMA-158 such as missing
operations or functions with incorrect parameter modes,
discrepancies between the Ada and C bindings and errors
in ECMA-149. The architecture of the interfaces and
their test harness has been designed to allow easy
porting from one PCTE implementation to another, and
also from one Ada compilation system to another; some
major constraints were imposed by the use of the C
interfaces as the underlying platform, particularly
regarding Ada's multi-threading abilities. The
advantages of using the interfaces include the benefits
of being able to implement tools in Ada instead of C;
insulation from the underlying PCTE implementation; and
the provision of facilities (e.g. call tracing) between
tools and PCTE.",
acknowledgement = ack-nhfb,
affiliation = "ICL Enterprises",
affiliationaddress = "Reading, Engl",
classification = "722.2; 723.1; 723.1.1; 723.5; 902.2; C6115
(Programming support); C6140D (High level languages)",
corpsource = "ICL Enterprises, Reading, UK",
fjournal = "Ada User",
journalabr = "Ada User J",
keywords = "Ada; Ada (programming language); Ada compilation
system; Ada interfaces; application program interfaces;
bindings; C (programming language); C interfaces; call
tracing; Codes (symbols); Computer aided software
engineering; ECMA PCTE standards; ECMA-149; ECMA-158;
ECMA-162; Errors; errors; fully conformant
implementation; incorrect parameter modes; missing
operations; multi-threading abilities; Portable Common
Tools Environment; portable implementation; programming
environments; software portability; software standards;
software tools; Standards; test harness; User
pubcountry = "Netherlands",
treatment = "P Practical",
author = "{SunSoft}",
title = "{Solaris} multithreaded programming guide",
publisher = pub-SUNSOFT,
address = pub-SUNSOFT:adr,
pages = "xviii + 158",
year = "1995",
ISBN = "0-13-160896-7",
ISBN-13 = "978-0-13-160896-2",
LCCN = "QA76.76.O63 S635 1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Multiprocessors; Operating systems (Computers);
Solaris (Computer file); UNIX (Computer file)",
author = "Doug Tamasanis",
title = "{Mathematica} meets {Warp}",
journal = j-BYTE,
volume = "20",
number = "5",
month = may,
year = "1995",
ISSN = "0360-5280 (print), 1082-7838 (electronic)",
ISSN-L = "0360-5280",
bibdate = "Fri May 24 09:57:14 MDT 1996",
bibsource = "Compendex database;
abstract = "Wolfram Research has ported Mathematica, the software
tool for quantitative analysis, from its Macintosh
origins to a wide range of platforms, including PCs,
Unix workstations, and several larger systems. The
latest port of Mathematica 2.2 is to OS/2 Warp. Now
OS/2 users do not have to rely on the Windows version
of the Mathematica kernel, which only simulates
multithreading. The new release takes full advantage of
the OS/2 preemptive scheduler, threading, and 32-bit
flat memory structure to both improve performance and
to greatly increase the size of the problems
Mathematica can handle. The OS/2 version is found
faster and more stable than the Windows version.",
acknowledgement = ack-nhfb,
affiliation = "BYTE",
classification = "722.2; 723.1; 723.1.1; 723.2; 723.5",
fjournal = "BYTE Magazine",
journalabr = "Byte",
keywords = "C (programming language); Command line interface;
Computer aided software engineering; Computer
architecture; Computer operating systems; Computer
simulation; Computer software; File editors; FORTRAN
(programming language); Graphical user interfaces;
Network protocols; Performance; Software Package
Mathematica; Word processing",
pagecount = "3",
author = "Richard N. Taylor and Kari A. Nies and Gregory Alan
Bolcer and Craig A. MacFarlane and Kenneth M. Anderson
and Gregory F. Johnson",
title = "Chiron-1: a software architecture for user interface
development, maintenance, and run-time support",
journal = j-TOCHI,
volume = "2",
number = "2",
pages = "105--144",
month = jun,
year = "1995",
ISSN = "1073-0516 (print), 1557-7325 (electronic)",
ISSN-L = "1073-0516",
bibdate = "Tue Jan 19 05:49:17 MST 1999",
bibsource = "http://www.acm.org/pubs/contents/journals/tochi/;
URL = "http://www.acm.org:80/pubs/citations/journals/tochi/1995-2-2/p105-taylor/",
abstract = "The Chiron-1 user interface system demonstrates key
techniques that enable a strict separation of an
application from its user interface. These techniques
include separating the control-flow aspects of the
application and user interface: they are concurrent and
may contain many threads. Chiron also separates
windowing and look-and-feel issues from dialogue and
abstract presentation decisions via mechanisms
employing a client-server architecture. To separate
application code from user interface code, user
interface agents called {\em artists\/} are attached to
instances of application abstract data types (ADTs).
Operations on ADTs within the application implicitly
trigger user interface activities within the artists.
Multiple artists can be attached to ADTs, providing
multiple views and alternative forms of access and
manipulation by either a single user or by multiple
users. Each artist and the application run in separate
threads of control. Artists maintain the user interface
by making remote calls to an abstract depiction
hierarchy in the Chiron server, insulating the user
interface code from the specifics of particular
windowing systems and toolkits. The Chiron server and
clients execute in separate processes. The
client-server architecture also supports multilingual
systems: mechanisms are demonstrated that support
clients written in programming languages other than
that of the server while nevertheless supporting
object-oriented server concepts. The system has been
used in several universities and research and
development projects. It is available by anonymous
ftp.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer-Human Interaction",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J756",
keywords = "design; languages",
subject = "{\bf H.5.2} Information Systems, INFORMATION
INTERFACES AND PRESENTATION, User Interfaces, User
interface management systems (UIMS). {\bf D.2.2}
Software, SOFTWARE ENGINEERING, Design Tools and
Techniques, User interfaces. {\bf D.2.m} Software,
SOFTWARE ENGINEERING, Miscellaneous, Reusable
author = "Radhika Thekkath",
title = "Design and performance of multithreaded
architectures",
type = "Thesis ({Ph.D.})",
school = "University of Washington",
address = "Seattle, WA, USA",
pages = "x + 100",
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Computer architecture; Multiprocessors",
author = "Khushroo Rustom Todiwala",
title = "A distributed ray tracing implementation using
multithreaded {RPC}",
type = "Thesis ({M.S.})",
number = "4691",
school = "University of Texas at El Paso",
address = "El Paso, TX, USA",
pages = "xi + 140",
year = "1995",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "Master's thesis / University of Texas at El Paso",
acknowledgement = ack-nhfb,
keywords = "Electronic data processing -- Distributed processing",
author = "Michel Toulouse and Teodor Gabriel Crainic and Michel
Gendreau",
title = "Communication issues in designing cooperative
multi-thread parallel searches",
type = "Report",
number = "CRT-95-47",
institution = "Centre de recherche sur les transports, Universit{\'e}
de Montr{\'e}al",
address = "Montr{\'e}al, Qu{\'e}bec, Canada",
year = "1995",
bibdate = "Sat Apr 20 11:20:32 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Dean M. Tullsen and Susan J. Eggers and Henry M.
Levy",
title = "Simultaneous multithreading: maximizing on-chip
parallelism",
journal = j-COMP-ARCH-NEWS,
volume = "23",
number = "2",
pages = "392--403",
month = may,
year = "1995",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:47 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
remark = "According to Hennessy and Patterson, Computer
Architecture, 6th edition, online appendix M
``Historical Perspectives and References'', page M-36,
this paper's authors ``provided the first realistic
simulation assessment and coined the term {\em
simultaneous multithreading}.''",
author = "Arthur {van Hoff}",
title = "{Java} and {Internet} Programming",
journal = j-DDJ,
volume = "20",
number = "8",
pages = "56, 58, 60--61, 101--102",
month = aug,
year = "1995",
ISSN = "1044-789X",
bibdate = "Thu Jan 9 09:35:43 MST 1997",
bibsource = "Compendex database;
UnCover database",
URL = "http://www.ddj.com/ddj/issues/j508a.htm",
abstract = "Java, a language designed for Internet development, is
an object-oriented, multithreaded, portable, dynamic
language that's similar to C, yet simpler than C++.",
abstract2 = "In 1990, a new language called `Java' was developed
which, it turns out, addresses many of the issues of
software distribution on the Internet. Java is a
simple, object-oriented, multi-threaded,
garbage-collected, secure, robust,
architecture-neutral, portable, high-performance,
dynamic language. The language is similar to C and C++
but much simpler. Java programs are compiled into a
binary format that can be executed on many platforms
without recompilation. The language contains mechanisms
to verify and execute binary Java programs in a
controlled environment, protecting computer from
potential viruses and security violations.",
acknowledgement = ack-nhfb,
affiliation = "Sun Microsystems",
classification = "721.1; 722.2; 722.3; 723.1; 723.1.1; C6110J
(Object-oriented programming); C6140D (High level
languages); C6150N (Distributed systems software)",
fjournal = "Dr. Dobb's Journal of Software Tools",
journalabr = "Dr Dobb's J Software Tools Prof Program",
keywords = "Architecture-neutral language; Binary format; Browser;
Bytecodes; Bytecodes, Java language; C (programming
language); Codes (symbols); Compilation; Computational
linguistics; Computer networks; Computer programming
languages; Computer software portability;
Garbage-collection; High-performance dynamic language;
Interactive programs; Interfaces (computer); Internet;
Internet programming; Java (programming language);
Multithreaded language; Multithreading; Object oriented
programming; Object-oriented language; Portable
language; Program compilers; Program interpreters;
Robust language; Secure language; Security of data;
Semantics; Software distribution; Software engineering;
Syntax; UNIX",
pagecount = "4",
thesaurus = "Complete computer programs; Internet; Object-oriented
languages; Object-oriented programming; Security of
data; Software portability",
author = "Deborah A. Wallach and Wilson C. Hsieh and Kirk L.
Johnson and M. Frans Kaashoek and William E. Weihl",
title = "Optimistic active messages: a mechanism for scheduling
communication with computation",
journal = j-SIGPLAN,
volume = "30",
number = "8",
pages = "217--226",
month = aug,
year = "1995",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:17:08 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Low-overhead message passing is critical to the
performance of many applications. Active messages (AMs)
reduce the software overhead for message handling:
messages are run as handlers instead of as threads,
which avoids the overhead of thread management and the
unnecessary data copying of other communication models.
Scheduling the execution of AMs is typically done by
disabling and enabling interrupts or by polling the
network. This primitive scheduling control puts severe
restrictions on the code that can be run in a message
handler. This paper describes a new software mechanism,
optimistic active messages (OAM), that eliminates these
restrictions; OAMs allow arbitrary user code to execute
in handlers, and also allow handlers to block. Despite
this gain in expressiveness, OAMs perform as well as
AMs. We used OAM as the base for a remote procedure
calling (RPC) system, Optimistic RPC (ORPC), for the
CM-5 multiprocessor; it consists of an optimized thread
package and a stub compiler that hides communication
details from the programmer. ORPC is 1.5 to 5 times
faster than traditional RPC (TRPC) for small messages
and performs as well as AMs. Applications that
primarily communicate using large data transfers or are
fairly coarse-grained perform equally well. For
applications that send many short messages, however,
the ORPC and AM implementations are up to 3 times
faster than the TRPC implementations. Using ORPC,
programmers obtain the benefits of well-proven
programming abstractions, do not have to be concerned
with communication details, and yet obtain nearly the
performance of hand-coded AM programs.",
acknowledgement = ack-nhfb,
affiliation = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
classification = "C6150N (Distributed systems software)",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "Application performance; Arbitrary user code;
Blocking; CM-5 multiprocessor; Coarse-grained
applications; Communication detail hiding;
Communication scheduling; Computation scheduling;
Expressiveness; Large data transfers; Low-overhead
message passing; Message handlers; Optimistic active
messages; Optimistic remote procedure calls; Optimized
thread package; Programming abstractions; Software
overhead; Stub compiler",
thesaurus = "Message passing; Remote procedure calls; Scheduling",
author = "Stephen Walter",
title = "Put Multiprocessing Systems to Work. {II}",
journal = j-UNIX-REVIEW,
volume = "13",
number = "1",
pages = "39--??",
month = jan,
year = "1995",
ISSN = "0742-3136",
bibdate = "Sat May 25 07:59:58 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
UnCover library database",
abstract = "Programming for multiprocessors requires use of
unusual features such as spin locks, mutex locks,
barrier synchronization, and the like. Using the POSIX
threads API helps, but the rest you have to do
acknowledgement = ack-nhfb,
fjournal = "UNIX review",
author = "Peter Wayner",
title = "Free Agents: a new generation of light-weight,
multithreaded operating environments provide security
and interoperability for agent developers",
journal = j-BYTE,
volume = "20",
number = "3",
pages = "105--??",
month = mar,
year = "1995",
ISSN = "0360-5280 (print), 1082-7838 (electronic)",
ISSN-L = "0360-5280",
bibdate = "Tue Jan 2 10:01:41 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "BYTE Magazine",
author = "Michael Yam",
title = "A {C++} Framework for {DCE} Threads",
journal = j-DDJ,
volume = "20",
type = "SB",
number = "??",
pages = "27--??",
month = jul # "\slash " # aug,
year = "1995",
ISSN = "1044-789X",
bibdate = "Mon Sep 2 09:09:39 MDT 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "M. Yasrebi",
title = "Experience with Distributed Objects in a Portable and
Multithreaded Library for a {LAN\slash WAN} Gateway
Application",
crossref = "IEEE:1995:PCL",
volume = "20",
pages = "164--173",
year = "1995",
bibdate = "Mon Sep 27 14:16:06 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
annote = "Also known as LCN'95. IEEE Cat no 95TB100005",
keywords = "computer communications; IEEE; LCN; local computer
author = "Gary Aitken",
title = "Moving from {C++} to {Java}",
journal = j-DDJ,
volume = "21",
number = "3",
pages = "52, 54--56",
month = mar,
year = "1996",
ISSN = "1044-789X",
bibdate = "Thu Jan 9 09:35:43 MST 1997",
bibsource = "Compendex database;
UnCover database",
abstract = "Java is claimed to be much easier to learn than C++,
but the difficulties most people have in learning to
program in both C++ and Java have little to do with
the language itself. This paper explores some of the
differences between Java and C++. The aim is to make
the user aware of potential problems and opportunities
when moving from C++ to Java. Brief explanations are
provided for those concepts that have until now been
unfamiliar to many users.",
acknowledgement = ack-nhfb,
affiliation = "Integrated Computer Solutions",
classification = "721.1; 722.2; 723.1; 723.1.1; 723.2",
fjournal = "Dr. Dobb's Journal of Software Tools",
journalabr = "Dr Dobb's J Software Tools Prof Program",
keywords = "C (programming language); Character arrays; Character
sets; Data structures; File organization; Garbage
collected language; Header files; Interfaces
(COMPUTER); Java; Machine code; Member function;
Multithreading; Object oriented programming; Pointers;
Program compilers; Program interpreters; Program
processors; Program translators; Programming theory;
Software engineering; Synchronization; Virtual
pagecount = "4",
author = "Beatrice Amrhein and Oliver Gloor and Wolfgang
K{\"u}chlin",
title = "A Case Study of Multi-Threaded {Gr{\"o}bner} Basis
Completion",
crossref = "LakshmanYN:1996:IPI",
pages = "95--102",
year = "1996",
bibdate = "Thu Mar 12 08:43:16 MST 1998",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/issac/236869/p95-amrhein/",
acknowledgement = ack-nhfb,
keywords = "algebraic computation; algorithms; experimentation;
ISSAC; performance; SIGNUM; SIGSAM; symbolic
subject = "{\bf I.1.3} Computing Methodologies, SYMBOLIC AND
ALGEBRAIC MANIPULATION, Languages and Systems,
Special-purpose algebraic systems. {\bf D.1.3}
Programming, Parallel programming. {\bf C.1.2} Computer
Systems Organization, PROCESSOR ARCHITECTURES, Multiple
Data Stream Architectures (Multiprocessors), Parallel
author = "Murali Annavaram",
title = "Blocking versus non-blocking: issues and tradeoffs in
multithreaded code execution",
type = "Thesis ({M.S.})",
school = inst-CSU,
address = inst-CSU:adr,
pages = "viii + 57",
year = "1996",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Multiprocessors -- Design and construction; Parallel
processing (Electronic computers)",
author = "Anonymous",
title = "World-wide distributed system using {Java} and the
{Internet}",
pages = "11--18",
year = "1996",
ISSN = "1082-8907",
bibdate = "Thu Dec 12 06:31:53 MST 1996",
bibsource = "Compendex database;
note = "IEEE catalog number 96TB100069.",
acknowledgement = ack-nhfb,
affiliation = "California Inst of Technology",
affiliationaddress = "CA, USA",
classification = "716.1; 722.4; 723; 723.1; 723.1.1",
conference = "Proceedings of the 1996 5th IEEE International
Symposium on High Performance Distributed Computing",
fjournal = "IEEE International Symposium on High Performance
Distributed Computing, Proceedings",
keywords = "Collaborative environments; Computer networks;
Computer programming languages; Computer software; Data
communication systems; Distributed computer systems;
Internet; Java; Multithreaded objects; Object oriented
programming; Program composition; World wide web",
meetingaddress = "Syracuse, NY, USA",
meetingdate = "Aug 6--9 1996",
meetingdate2 = "08/06--09/96",
sponsor = "IEEE",
author = "K. Arnold and J. Gosling",
title = "Multithreaded programming in {Java}",
journal = j-WEB-TECHNIQUES,
volume = "1",
number = "7",
pages = "34--40, 42--43",
month = oct,
year = "1996",
ISSN = "1086-556X",
bibdate = "Sat Mar 15 08:49:09 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classcodes = "C6150N (Distributed systems software); C6110J
(Object-oriented programming); C6140D (High level
languages); C6150J (Operating systems)",
fjournal = "Web Techniques",
keywords = "display; display code; dynamic behaviour; handshaking;
interactive program; interrupts; Java; Java object
oriented language; multiple; multiprogramming;
multithreaded programming; multithreaded system;
object-oriented languages; object-oriented programming;
operations; parallel programming; polling; problems;
real world software; synchronisation; threads; updates;
user input",
treatment = "P Practical",
author = "Frank Bellosa and Martin Steckermeier",
title = "The Performance Implications of Locality Information
Usage in Shared-Memory Multiprocessors",
journal = j-J-PAR-DIST-COMP,
volume = "37",
number = "1",
pages = "113--121",
day = "25",
month = aug,
year = "1996",
DOI = "https://doi.org/10.1006/jpdc.1996.0112",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:00 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0112/production;
acknowledgement = ack-nhfb,
classification = "C5220P (Parallel architecture); C5440
(Multiprocessing systems); C5470 (Performance
evaluation and testing)",
corpsource = "Dept. of Comput. Sci. IV, Erlangen-Nurnberg Univ.,
Germany",
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "cache miss counters; cache storage; evaluation;
locality information; memory multiprocessors; parallel
architectures; performance; scalable shared-;
scheduling decisions; shared memory systems;
shared-memory multiprocessors; thread scheduling
treatment = "P Practical",
author = "G. D. Benson and R. A. Olsson",
title = "The design of microkernel support for the {SR}
concurrent programming language",
crossref = "Szymanski:1996:LCR",
pages = "227--240",
year = "1996",
bibdate = "Sat Sep 28 18:12:58 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib;
acknowledgement = ack-nhfb,
affiliation = "Dept. of Comput. Sci., California Univ., Davis, CA,
USA",
classification = "C6110P (Parallel programming); C6140D (High level
languages); C6150J (Operating systems); C6150N
(Distributed systems software)",
keywords = "Distributed environment; Distributed operating system;
Distributed programming; Distributed programming
language; Mach microkernel; Message passing;
Microkernel; Microkernel support; Minimal kernel;
Multithreaded program; Networked operating system;
Parallel programming; SR concurrent programming
thesaurus = "Distributed processing; Message passing;
Multiprocessing programs; Network operating systems;
Operating system kernels; Parallel languages",
author = "C. Berg",
title = "How do threads work and how can {I} create a
general-purpose event?",
journal = j-DDJ,
volume = "21",
number = "11",
pages = "111--115, 126--127",
month = nov,
year = "1996",
ISSN = "1044-789X",
bibdate = "Sat Mar 15 08:49:09 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classcodes = "C6110J (Object-oriented programming); C6140D (High
level languages); C6150J (Operating systems); C6150N
(Distributed systems software)",
corpsource = "Digital Focus, USA",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "(computers); application; application program
interfaces; applications; event; exception handling;
general-purpose event; Internet; Java; Java thread
mechanism; languages; lightweight processes;
multiprocessor architecture; multithreading; object;
object-oriented; object-oriented programming; operating
systems; oriented language; programming interface;
scheduling; synchronisation; synchronization; thread
programming; threads; web",
treatment = "P Practical",
author = "Cliff Berg",
title = "{Java Q and A}: How do Threads Work and How Can {I}
Create a General-Purpose Event?",
journal = j-DDJ,
volume = "21",
number = "11",
pages = "111--??",
day = "1",
month = nov,
year = "1996",
ISSN = "1044-789X",
bibdate = "Tue Oct 15 08:20:29 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "M. A. Bhandarkar and L. V. Kale",
title = "{MICE}: a prototype {MPI} implementation in {Converse}
environment",
crossref = "IEEE:1996:PSM",
pages = "26--31",
year = "1996",
bibdate = "Sat Apr 19 16:34:54 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C6110P (Parallel programming); C6115 (Programming
support); C6150E (General utility programs); C6150N
(Distributed systems software)",
conftitle = "Proceedings. Second MPI Developer's Conference",
corpsource = "Dept. of Comput. Sci., Illinois Univ., Urbana, IL,
USA",
keywords = "Abstract Device Interface; application program
interfaces; communication; computations; Converse
interoperable parallel programming environment; message
managers; message passing; MICE; MPI modules; MPICH;
multi-threaded MPI programs; open systems; parallel
programming; programming environments; prototype MPI
implementation; public-domain MPI implementation; PVM
interoperation; thread objects; utility programs",
sponsororg = "IEEE Comput. Soc. Tech. Committee on Distributed
treatment = "P Practical",
author = "Ricardo Bianchini and Beng-Hong Lim",
title = "Evaluating the Performance of Multithreading and
Prefetching in Multiprocessors",
journal = j-J-PAR-DIST-COMP,
volume = "37",
number = "1",
pages = "83--97",
day = "25",
month = aug,
year = "1996",
DOI = "https://doi.org/10.1006/jpdc.1996.0109",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:00 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0109/production;
acknowledgement = ack-nhfb,
classification = "C5220P (Parallel architecture); C5440
(Multiprocessing systems); C5470 (Performance
evaluation and testing); C6110P (Parallel programming);
C6150N (Distributed systems software)",
corpsource = "COPPE Syst. Eng., Federal Univ. of Rio de Janeiro,
Brazil",
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "cache; memory latency; MIT Alewife multiprocessor;
multiprocessing systems; multiprocessors;
multithreading; parallel; parallel architectures;
performance evaluation; programming; software
prefetching; storage management",
treatment = "P Practical",
author = "Robert D. Blumofe and Christopher F. Joerg and Bradley
C. Kuszmaul and Charles E. Leiserson and Keith H.
Randall and Yuli Zhou",
title = "{Cilk}: An Efficient Multithreaded Runtime System",
journal = j-J-PAR-DIST-COMP,
volume = "37",
number = "1",
pages = "55--69",
day = "25",
month = aug,
year = "1996",
DOI = "https://doi.org/10.1006/jpdc.1996.0107",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:00 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0107/production;
acknowledgement = ack-nhfb,
classification = "C4240P (Parallel programming and algorithm theory);
C6110P (Parallel programming)",
corpsource = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "Cilk; critical path analysis; critical-path length;
directed acyclic graph; load balancing; multithreaded
runtime system; parallel; parallel algorithms; parallel
programming; processor scheduling; programming; runtime
scheduling; synchronisation",
treatment = "P Practical; T Theoretical or Mathematical",
author = "Reinhard B{\"u}ndgen and Manfred G{\"o}bel and
Wolfgang K{\"u}chlin",
title = "Strategy Compliant Multi-Threaded Term Completion",
journal = j-J-SYMBOLIC-COMP,
volume = "21",
number = "4/5/6",
pages = "475--506 (or 475--505??)",
month = apr # ", " # may # " \& " # jun,
year = "1996",
ISSN = "0747-7171 (print), 1095-855X (electronic)",
ISSN-L = "0747-7171",
MRclass = "68Q42 (68Q22 68Q40)",
MRnumber = "1 420 910",
bibdate = "Sat May 10 15:54:09 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Parallel symbolic computation.",
acknowledgement = ack-nhfb,
classcodes = "C7310 (Mathematics computing); C5440 (Multiprocessing
systems); C4210L (Formal languages and computational
linguistics); C6130 (Data handling techniques)",
corpsource = "Wilhelm-Schickard-Inst. fur Inf., Tubingen Univ.,
Germany",
fjournal = "Journal of Symbolic Computation",
journal-URL = "http://www.sciencedirect.com/science/journal/07477171",
keywords = "completion module AC; Knuth--Bendix completion;
parallel; parallel architectures; rewriting systems;
shared memory; strategy compliant multi-threaded term
completion; symbol manipulation; systems;
term-rewriting system PaReDuX; unfailing completion",
treatment = "A Application; P Practical",
author = "Nikos Chrisochoides",
title = "Multithreaded model for the dynamic load-balancing of
parallel adaptive {PDE} computations",
journal = j-APPL-NUM-MATH,
volume = "20",
number = "4",
pages = "349--365",
day = "3",
month = jun,
year = "1996",
ISSN = "0168-9274 (print), 1873-5460 (electronic)",
ISSN-L = "0168-9274",
bibdate = "Wed Jul 28 14:36:24 MDT 1999",
bibsource = "Compendex database;
URL = "http://www.elsevier.com/cgi-bin/cas/tree/store/apnum/cas_sub/browse/browse.cgi?year=1996&volume=20&issue=4&aid=652",
acknowledgement = ack-nhfb,
affiliation = "Cornell Univ",
affiliationaddress = "Ithaca, NY, USA",
classification = "722.4; 723.1; 723.5; 731.1; 921.2; 921.6",
fjournal = "Applied Numerical Mathematics: Transactions of IMACS",
journal-URL = "http://www.sciencedirect.com/science/journal/01689274",
journalabr = "Appl Numer Math",
keywords = "Calculations; Codes (symbols); Computational
complexity; Computer software; Dynamic load balancing;
Load balancing algorithms; Mathematical models;
Multicomputers; Multithreaded model; Numerical methods;
Parallel processing systems; Partial differential
equations; Processor workloads; Program complexity;
Program processors; Synchronization",
author = "Donald G. Drake",
title = "Introduction to {Java} threads",
journal = j-JAVAWORLD,
volume = "1",
number = "2",
pages = "??--??",
month = apr,
year = "1996",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Aug 13 08:48:26 MDT 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/jw-04-1996/jw-04-threads.htm",
acknowledgement = ack-nhfb,
author = "Richard J. Eickemeyer and Ross E. Johnson and Steven
R. Kunkel and Mark S. Squillante and Shiafun Liu",
title = "Evaluation of multithreaded uniprocessors for
commercial application environments",
journal = j-COMP-ARCH-NEWS,
volume = "24",
number = "2",
pages = "203--212",
month = may,
year = "1996",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:47 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Dean Engelhardt and Andrew Wendelborn",
title = "A Partitioning-Independent Paradigm for Nested Data
Parallelism",
journal = j-INT-J-PARALLEL-PROG,
volume = "24",
number = "4",
pages = "291--317",
month = aug,
year = "1996",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Sat Apr 26 11:36:49 MDT 1997",
bibsource = "Compendex database;
acknowledgement = ack-nhfb,
affiliation = "Univ of Adelaide",
affiliationaddress = "Aust",
classification = "721.1; 722.4; 723.1.1; 723.2; 723.5; C6110P
(Parallel programming); C6120 (File organisation);
C6150C (Compilers, interpreters and other processors);
C6150N (Distributed systems software)",
corpsource = "Dept. of Comput. Sci., Adelaide Univ., SA, Australia",
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
journalabr = "Int J Parallel Program",
keywords = "abstract machine; Computational methods; Computer
simulation; costs; data parallel model; data
partitioning; Data structures; data structures; High
level languages; irregular data structures; Multi
threading; multinode execution model; Multiprocessing
systems; multiprocessing systems; multiprocessor
machines; nested data parallelism; Nested data
parallelism; nested data structures; nodal
multi-threading; one-dimensional data parallel
operator; parallel computation; Parallel execution
models; Parallel processing systems; parallel
programming; partitioning-independent paradigm;
Performance; performance statistics; program compilers;
software performance evaluation; Thinking machines;
Thinking Machines CM-5",
treatment = "P Practical",
author = "Dino Esposito",
title = "Multithreading and {Visual Basic}",
journal = j-DDJ,
volume = "21",
number = "12",
pages = "46--??",
month = dec,
year = "1996",
ISSN = "1044-789X",
bibdate = "Sat Mar 07 08:22:15 1998",
bibsource = "http://www.ddj.com/index/author/index.htm;
abstract = "Although Visual Basic does not support native
multithreading, it does support the Windows API. This
means you can write VB applications composed of two or
more threads. Dino shows you how to create
multithreaded applications using both the SDK and
Visual Basic",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Philipp Farber",
title = "Execution architecture of the multithreaded {ADAM}
prototype",
type = "Thesis ({doctoral})",
number = "13",
school = "Swiss Federal Institute of Technology",
address = "Zurich, Switzerland",
pages = "iv + 127",
year = "1996",
ISBN = "3-7281-2384-6",
ISBN-13 = "978-3-7281-2384-8",
LCCN = "????",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "TIK-Schriftenreihe",
acknowledgement = ack-nhfb,
keywords = "Computer architecture; Parallel processing (Electronic
computers); Parallel programming (Computer science)",
author = "A. Farcy and O. Temam",
title = "Improving Single-Process Performance with
Multithreaded Processors",
crossref = "ACM:1996:FCP",
pages = "350--357",
year = "1996",
bibdate = "Wed Mar 18 12:33:18 MST 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
annote = "Also known as ICS'96. Held as part of the Federated
computing research conference (FCRC'96)",
keywords = "ACM; architecture; computer; FCRC; ICS; SIGARCH;
author = "P. Fatourou and P. Spirakis",
title = "Scheduling Algorithms for Strict Multithreaded
Computations",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1178",
pages = "407--??",
year = "1996",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Fri Aug 22 11:59:49 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "E. Feuerstein and A. S. {De Loma}",
title = "On Multi-threaded Paging",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1178",
pages = "417--??",
year = "1996",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Fri Aug 22 11:59:49 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "I. Foster and J. Geisler and S. Tuecke",
title = "{MPI} on the {I-WAY}: a wide-area, multimethod
implementation of the {Message Passing Interface}",
crossref = "IEEE:1996:PSM",
pages = "10--17",
year = "1996",
bibdate = "Sat Apr 19 16:34:54 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C5620W (Other computer networks); C6110B (Software
engineering techniques); C6115 (Programming support);
C6130S (Data security); C6150E (General utility
programs); C6150N (Distributed systems software)",
conftitle = "Proceedings. Second MPI Developer's Conference",
corpsource = "Argonne Nat. Lab., IL, USA",
keywords = "application program interfaces; authentication;
automatic configuration mechanisms; communication
mechanisms; geographically distributed computing
resources; geographically distributed database
resources; geographically distributed graphics
resources; geographically distributed networking;
heterogeneous systems; high-speed wide-area networks;
I-WAY distributed- computing experiment; message
authentication; message passing; Message Passing
Interface; MPICH; Nexus multithreaded runtime system;
parallel programming; portable high-performance
programming model; process creation; programming
environments; software environment; software libraries;
utility programs; wide area networks",
sponsororg = "IEEE Comput. Soc. Tech. Committee on Distributed
treatment = "P Practical",
author = "Ian Foster and Carl Kesselman and Steven Tuecke",
title = "The {Nexus} Approach to Integrating Multithreading and
Communication",
journal = j-J-PAR-DIST-COMP,
volume = "37",
number = "1",
pages = "70--82",
day = "25",
month = aug,
year = "1996",
DOI = "https://doi.org/10.1006/jpdc.1996.0108",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:00 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0108/production;
acknowledgement = ack-nhfb,
classification = "C6110P (Parallel programming); C6150C (Compilers,
interpreters and other processors); C6150N (Distributed
systems software)",
corpsource = "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
USA",
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "asynchronous messaging; client-server systems;
compiler target; data communication; distributed;
distributed-memory systems; dynamic; dynamic
communication; global memory model; global pointer;
mechanism; memory systems; message passing;
multithreading; Nexus runtime system; parallel
languages; parallel programming; program compilers;
remote service request; synchronisation; thread
treatment = "P Practical",
author = "Seth Copen Goldstein and Klaus Erik Schauser and David
E. Culler",
title = "Lazy Threads: Implementing a Fast Parallel Call",
journal = j-J-PAR-DIST-COMP,
volume = "37",
number = "1",
pages = "5--20",
day = "25",
month = aug,
year = "1996",
DOI = "https://doi.org/10.1006/jpdc.1996.0104",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:00 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0104/production;
acknowledgement = ack-nhfb,
classification = "C4240P (Parallel programming and algorithm theory);
C6120 (File organisation)",
corpsource = "Comput. Sci. Div., California Univ., Berkeley, CA,
USA",
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "code generation strategy; lazy threads; multithreaded
execution models; parallel call; parallel programming;
parallel-ready sequential call; storage management",
treatment = "T Theoretical or Mathematical",
author = "Sreenivas Gollapudi",
title = "A multithreaded client-server architecture for
distributed multimedia systems",
type = "Thesis ({M.S.})",
school = "Dept. of Computer Science, State University of New
York at Buffalo",
address = "Buffalo, NY, USA",
pages = "viii + 72",
year = "1996",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Also available as technical report 96-13.",
acknowledgement = ack-nhfb,
keywords = "Electronic data processing -- Distributed processing;
Multimedia systems -- Design and construction;
Multitasking (Computer science)",
author = "Dirk Grunwald and Richard Neves",
title = "Whole-Program Optimization for Time and Space
Efficient Threads",
journal = j-SIGPLAN,
volume = "31",
number = "9",
pages = "50--59",
month = sep,
year = "1996",
ISBN = "0-89791-767-7",
ISBN-13 = "978-0-89791-767-4",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sat May 1 15:50:57 MDT 1999",
bibsource = "http://www.acm.org/pubs/toc/;
note = "Co-published as SIGOPS Operating Systems Review {\bf
30}(5), December 1996, and as SIGARCH Computer
Architecture News, {\bf 24}(special issue), October
URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/237090/p50-grunwald/",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "algorithms; design; languages; performance",
subject = "{\bf D.3.4} Software, PROGRAMMING LANGUAGES,
Processors, Optimization. {\bf C.1.2} Computer Systems
Organization, PROCESSOR ARCHITECTURES, Multiple Data
Stream Architectures (Multiprocessors), Parallel
processors**. {\bf D.1.3} Software, PROGRAMMING
TECHNIQUES, Concurrent Programming, Parallel
author = "Marc A. Hamilton",
title = "{Java} and the Shift to Net-Centric Computing",
journal = j-COMPUTER,
volume = "29",
number = "8",
pages = "31--39",
month = aug,
year = "1996",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Sat Mar 15 08:49:09 MST 1997",
bibsource = "Compendex database;
UnCover library database",
note = "Mentions Java's use of Unicode characters.",
abstract = "Java, with its write once, run anywhere model, changes
the basic techniques by which software is designed,
developed, and deployed.",
acknowledgement = ack-nhfb,
affiliation = "Sun Microsystems",
affiliationaddress = "El Segundo, CA, USA",
classcodes = "C6140D (High level languages); C6110J (Object-oriented
programming); C7210 (Information services and centres);
C6120 (File organisation)",
classification = "722.1; 722.3; 723; 723.1; 723.1.1; 723.2; 723.3;
723.5; C6110J (Object-oriented programming); C6120
(File organisation); C6140D (High level languages);
C7210 (Information services and centres)",
corpsource = "Sun Microsyst., El Segundo, CA, USA",
fjournal = "Computer",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
journalabr = "Computer",
keywords = "application program interfaces; application
programming; C; C (programming language); C++; computer
aided software; Computer architecture; Computer
hardware; Computer networks; Computer operating
systems; Computer programming languages; Computer
simulation; Computer software; Computer software
portability; Distributed database systems; Dynamic
linking; engineering; environments; garbage collection;
interfaces; Internet; Internet, Object oriented
programming; interpreted language; Java; Java
programming language; language; management; Memory
management; Middleware; Middleware, Computer
programming languages; multithreading; Multithreading;
Net centric computing; net-centric computing; Network
centric computing; Numeric data types; Object oriented
programming; object-; object-oriented languages;
object-oriented programming; oriented programming;
program compiler; Program compilers; program debugging;
Program interpreters; program testing; programming
environments; Security of data; software development;
Software engineering; software-development life cycle;
storage; Storage allocation (computer); Virtual
machines; Web browser; Web browsers; World Wide Web",
treatment = "P Practical",
author = "D. P. Helmbold and C. E. McDowell",
title = "A Taxonomy of Race Conditions",
journal = j-J-PAR-DIST-COMP,
volume = "33",
number = "2",
pages = "159--164",
day = "15",
month = mar,
year = "1996",
DOI = "https://doi.org/10.1006/jpdc.1996.0034",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:18:59 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0034/production;
acknowledgement = ack-nhfb,
classification = "C4230 (Switching theory); C4240P (Parallel
programming and algorithm theory); C6110P (Parallel
corpsource = "Dept. of Comput. and Inf. Sci., California Univ.,
Santa Cruz, CA, USA",
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "access; anomalies; hazards and race conditions;
multiple threads; nondeterministic behavior; parallel
programming; race conditions taxonomy; timing",
treatment = "P Practical; T Theoretical or Mathematical",
author = "Morten Hertzum and Erik Fr{\o}kj{\ae}r",
title = "Browsing and querying in online documentation: a study
of user interfaces and the interaction process",
journal = j-TOCHI,
volume = "3",
number = "2",
pages = "136--161",
month = jun,
year = "1996",
ISSN = "1073-0516 (print), 1557-7325 (electronic)",
ISSN-L = "1073-0516",
bibdate = "Tue Jan 19 05:49:17 MST 1999",
bibsource = "http://www.acm.org/pubs/contents/journals/tochi/;
URL = "http://www.acm.org:80/pubs/citations/journals/tochi/1996-3-2/p136-hertzum/",
abstract = "A user interface study concerning the usage
effectiveness of selected retrieval modes was conducted
using an experimental text retrieval system, TeSS,
giving access to online documentation of certain
programming tools. Four modes of TeSS were compared:
(1) browsing, (2) conventional boolean retrieval, (3)
boolean retrieval based on Venn diagrams, and (4) these
three combined. Further, the modes of TeSS were
compared to the use of printed manuals. The subjects
observed were 87 computer science students who solved a
number of information retrieval tasks in an area of
computing new to them. In the
experiment the use of printed manuals is faster and
provides answers of higher quality than any of the
electronic modes. Therefore, claims about the
effectiveness of computer-based text retrieval have to
be wary in situations where printed manuals are
manageable to the user. Among the modes of TeSS,
browsing is the fastest and the one causing the fewest
operational errors. On the same two variables, time and
operational errors, the Venn diagram mode performs
better than conventional boolean retrieval. The
combined mode scores worst on the objective performance
measures; nonetheless nearly all subjects prefer this
mode. Concerning the interaction process, the subjects
tend to manage the complexities of the information
retrieval tasks by issuing series of simple commands
and exploiting the interactive capabilities of TeSS. To
characterize the dynamics of the interaction process
two concepts are introduced; threads and sequences of
tactics. Threads in a query sequence describes the
continuity during retrieval. Sequences of tactics
concern the combined mode and describe how different
retrieval modes succeed each other as the retrieval
process evolves.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer-Human Interaction",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J756",
keywords = "experimentation; human factors; performance",
subject = "{\bf H.5.2} Information Systems, INFORMATION
Evaluation/methodology. {\bf H.3.3} Information
Search and Retrieval, Query formulation. {\bf H.3.3}
Information Search and Retrieval, Retrieval models.
{\bf H.3.4} Information Systems, INFORMATION STORAGE
AND RETRIEVAL, Systems and Software. {\bf H.5.2}
PRESENTATION, User Interfaces, Training, help, and
author = "Greg Hudson",
title = "Multithreaded design in the {Athena} environment",
type = "Thesis ({M. Eng.})",
school = "Massachusetts Institute of Technology, Department of
Electrical Engineering and Computer Science",
address = "Cambridge, MA, USA",
pages = "240",
year = "1996",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Herbert H. J. Hum and Olivier Maquelin and Kevin B.
Theobald and Xinmin Tian and Guang R. Gao and Laurie J.
title = "A Study of the {EARTH-MANNA} Multithreaded System",
journal = j-INT-J-PARALLEL-PROG,
volume = "24",
number = "4",
pages = "319--348",
month = aug,
year = "1996",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Sat Apr 26 11:36:49 MDT 1997",
bibsource = "Compendex database;
acknowledgement = ack-nhfb,
affiliation = "Intel Corp",
affiliationaddress = "OR, USA",
classification = "722.3; 722.4; 723.5; 731.1; C5220P (Parallel
architecture); C5440 (Multiprocessing systems); C5470
(Performance evaluation and testing); C6150N
(Distributed systems software)",
corpsource = "Dept. of Meas., Archit. and Planning, Intel Corp.,
Hillsboro, OR, USA",
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
journalabr = "Int J Parallel Program",
keywords = "ASIC synchronization unit; benchmarks; Communication
latency; communication latency; Computer architecture;
Computer hardware; Computer simulation; Data
communication systems; data flow computing;
dataflow-like thread synchronizations; earth manna
system; EARTH-MANNA multithreaded system; Execution
unit; multiprocessing systems; Multiprocessing systems;
multiprocessor systems; multithreaded architecture;
Multithreaded system; off-the-shelf execution unit;
parallel architectures; Parallel processing systems;
performance; Performance; performance evaluation;
processor scheduling; Program processors; remote
requests; Scheduling; scheduling; sequentially-executed
code; synchronisation; Synchronization;
synchronization; Synchronization unit; uniprocessor
treatment = "P Practical",
author = "A. R. Hurson and Krishna M. Kavi and Behrooz Shirazi
and Ben Lee",
title = "Cache Memories for Dataflow Systems",
journal = j-IEEE-PAR-DIST-TECH,
volume = "4",
number = "4",
pages = "50--64",
month = "Winter",
year = "1996",
DOI = "https://doi.org/10.1109/88.544436",
ISSN = "1063-6552 (print), 1558-1861 (electronic)",
ISSN-L = "1063-6552",
bibdate = "Mon Jun 7 07:52:29 MDT 1999",
bibsource = "Compendex database;
URL = "http://dlib.computer.org/pd/books/pd1996/pdf/p4050.pdf;
acknowledgement = ack-nhfb,
affiliation = "Pennsylvania State Univ",
affiliationaddress = "PA, USA",
classification = "721.1; 722.1; 722.2; 723; 723.1; 731.1; C5220P
(Parallel architecture); C5320G (Semiconductor
storage); C5440 (Multiprocessing systems); C6110P
(Parallel programming); C6120 (File organisation)",
corpsource = "Dept. of Comput. Sci. and Eng., Pennsylvania State
Univ., University Park, PA, USA",
fjournal = "IEEE Parallel and Distributed Technology: Systems and
journalabr = "IEEE Parallel Distrib Technol",
keywords = "Algorithms; architectural model; Buffer storage; cache
memories; Cache misses; cache storage; Computer
architecture; computer architectures; Computer systems
programming; Context switching; control flow
architecture; control flow processing; dataflow
architectures; dataflow computation; dataflow
environment; dataflow processing; dataflow program;
dataflow programming environments; Dataflow systems;
dataflow systems; localities; Memory latencies;
Multithreading; parallel architectures; parallel
machines; Parallel processing systems; parallel
programming; Process control; Program compilers;
Program processors; Sequential switching; Storage
allocation (computer); temporal; Throughput; Virtual
treatment = "P Practical",
author = "Christopher F. (Christopher Frank) Joerg",
title = "The {Cilk} system for parallel multithreaded
type = "Thesis ({Ph.D.})",
school = "Massachusetts Institute of Technology, Department of
Electrical Engineering and Computer Science",
address = "Cambridge, MA, USA",
pages = "199",
year = "1996",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Vijay Karamcheti and John Plevyak and Andrew A.
title = "Runtime Mechanisms for Efficient Dynamic
journal = j-J-PAR-DIST-COMP,
volume = "37",
number = "1",
pages = "21--40",
day = "25",
month = aug,
year = "1996",
DOI = "https://doi.org/10.1006/jpdc.1996.0105",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:00 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0105/production;
acknowledgement = ack-nhfb,
classification = "C4240P (Parallel programming and algorithm theory);
C5220P (Parallel architecture); C6150C (Compilers,
interpreters and other processors)",
corpsource = "Dept. of Comput. Sci., Illinois Univ., Urbana, IL,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "compiler; distributed memory machines; distributed
memory systems; dynamic multithreading; hybrid;
Illinois Concert runtime system; parallel; parallel
architectures; program compilers; programming; pull
messaging; stack-heap; threads",
treatment = "P Practical",
author = "Steve Kleiman and Devang Shah and Bart Smaalders",
title = "Programming with threads",
publisher = pub-PH,
address = pub-PH:adr,
pages = "xxviii + 534",
year = "1996",
ISBN = "0-13-172389-8",
ISBN-13 = "978-0-13-172389-4",
LCCN = "QA76.58 .K53 1996",
bibdate = "Fri May 10 12:18:17 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "multitasking (computer science); parallel programming
(computer science); synchronization",
author = "S. Leary",
title = "{C++} exception handling in multithreaded programs",
journal = j-C-PLUS-PLUS-REPORT,
volume = "8",
number = "2",
pages = "20--31",
month = feb,
year = "1996",
ISSN = "1040-6042",
bibdate = "Tue Mar 25 13:34:48 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classcodes = "C6110J (Object-oriented programming); C6140D (High
level languages); C6150J (Operating systems); C6130
(Data handling techniques)",
corpsource = "Dresser-Wayne Ind., USA",
fjournal = "C++ Report",
keywords = "C language; C++; exception handling; exception-aware
thread class; exception-safe programming; lightweight
threads; multiprogramming; multitasking; multithreaded
programs; object oriented programming; object-;
object-oriented programming; operating; oriented
languages; OS/2; reusable C++ classes; software
reusability; Solaris; systems; systems (computers);
thread manager class; thread-safe reference counting
class; Windows 95; Windows NT",
treatment = "P Practical",
author = "Bil Lewis and Daniel J. Berg",
title = "Threads Primer: a Guide to Multithreaded
publisher = pub-SUNSOFT,
address = pub-SUNSOFT:adr,
pages = "xxvi + 319",
year = "1996",
ISBN = "0-13-443698-9",
ISBN-13 = "978-0-13-443698-2",
LCCN = "QA76.642 .L478 1996",
bibdate = "Fri Apr 11 17:06:46 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "Sun BluePrints Program",
acknowledgement = ack-nhfb,
keywords = "POSIX (Computer software standard); Threads (Computer
programs); UNIX (Computer file)",
author = "Beng-Hong Lim and Ricardo Bianchini",
title = "Limits on the performance benefits of multithreading
and prefetching",
journal = j-SIGMETRICS,
volume = "24",
number = "1",
pages = "37--46",
month = may,
year = "1996",
CODEN = "????",
DOI = "https://doi.org/10.1145/233008.233021",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Thu Jun 26 11:21:30 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper presents new analytical models of the
performance benefits of multithreading and prefetching,
and experimental measurements of parallel applications
on the MIT Alewife multiprocessor. For the first time,
both techniques are evaluated on a real machine as
opposed to simulations. The models determine the region
in the parameter space where the techniques are most
effective, while the measurements determine the region
where the applications lie. We find that these regions
do not always overlap significantly. The multithreading
model shows that only 2-4 contexts are necessary to
maximize this technique's potential benefit in current
multiprocessors. Multithreading improves execution time
by less than 10\% for most of the applications that we
examined. The model also shows that multithreading can
significantly improve the performance of the same
applications in multiprocessors with longer latencies.
Reducing context-switch overhead is not crucial. The
software prefetching model shows that allowing 4
outstanding prefetches is sufficient to achieve most of
this technique's potential benefit on current
multiprocessors. Prefetching improves performance over
a wide range of parameters, and improves execution time
by as much as 20-50\% even on current multiprocessors.
The two models show that prefetching has a significant
advantage over multithreading for machines with low
memory latencies and/or applications with high cache
miss rates because a prefetch instruction consumes less
time than a context-switch.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
author = "David K. Lowenthal and Vincent W. Freeh and Gregory R.
title = "Using Fine-Grain Threads and Run-Time Decision Making
in Parallel Computing",
journal = j-J-PAR-DIST-COMP,
volume = "37",
number = "1",
pages = "41--54",
day = "25",
month = aug,
year = "1996",
DOI = "https://doi.org/10.1006/jpdc.1996.0106",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:00 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0106/production;
acknowledgement = ack-nhfb,
classification = "C5220P (Parallel architecture); C6110P (Parallel
programming); C4240P (Parallel programming and algorithm
corpsource = "Dept. of Comput. Sci., Arizona Univ., Tucson, AZ,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "computing; distributed shared memory;
distributed-memory multiprocessors; fine-grain;
fine-grain threads; parallel; parallel architectures;
parallel programming; parallelism; run-time decision
treatment = "P Practical",
author = "I. Mane",
title = "Survey of the {Java} programming language",
journal = j-ELECTRONIK,
volume = "45",
number = "17",
pages = "84--87",
day = "20",
month = "????",
year = "1996",
ISSN = "0013-5658",
bibdate = "Sat Mar 15 08:49:09 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classcodes = "C6140D (High level languages); C6150C (Compilers,
interpreters and other processors)",
countrypub = "Germany",
fjournal = "Elektronik",
keywords = "fixed; high level languages; Java programming
language; memory partitions; multi-threading; program
compilers; source code compiler",
language = "German",
treatment = "G General Review",
author = "Weihua Mao",
title = "Performance modeling of data prefetching and
multithreading in scalable multiprocessors",
type = "Thesis ({Ph.D.})",
school = "University of Southern California",
address = "Los Angeles, CA, USA",
pages = "xi + 130",
year = "1996",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
alttitle = "Performance modeling of data prefetching and
multithreading in scalable multiprocessors",
author = "Chuck McManis",
title = "{Java} In Depth: Synchronizing threads in {Java}",
journal = j-JAVAWORLD,
volume = "1",
number = "2",
pages = "??--??",
month = apr,
year = "1996",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Aug 13 08:48:26 MDT 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/jw-04-1996/jw-04-synch.htm",
acknowledgement = ack-nhfb,
author = "Chuck McManis",
title = "{Java} In Depth: Synchronizing threads in {Java},
{Part II}",
journal = j-JAVAWORLD,
volume = "1",
number = "3",
pages = "??--??",
month = may,
year = "1996",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Aug 13 08:48:26 MDT 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/jw-05-1996/jw-05-mcmanis.htm",
acknowledgement = ack-nhfb,
author = "Chuck McManis",
title = "{Java} In Depth: Threads and applets and visual
journal = j-JAVAWORLD,
volume = "1",
number = "5",
pages = "??--??",
month = jul,
year = "1996",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Aug 13 08:48:26 MDT 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/jw-07-1996/jw-07-mcmanis.htm",
acknowledgement = ack-nhfb,
author = "A. Mikschl and W. Damm",
title = "{MSparc}: a Multithreaded {Sparc}",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1124",
pages = "461--??",
year = "1996",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Oct 29 14:12:39 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Amitabh Mishra",
title = "Task and instruction scheduling in parallel
multithreaded processors",
type = "Thesis ({M.S.})",
school = "Department of Computer Science, Texas A\&M
address = "College Station, TX, USA",
pages = "ix + 60",
year = "1996",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Major computer science",
author = "John D. Mitchell",
title = "{Java} Tips: More about threads and the resize
journal = j-JAVAWORLD,
volume = "1",
number = "4",
pages = "??--??",
month = jun,
year = "1996",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Aug 13 08:48:26 MDT 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/javatips/jw-javatip9.htm",
acknowledgement = ack-nhfb,
author = "Simon W. (Simon William) Moore",
title = "Multithreaded processor design",
volume = "SECS 358",
publisher = pub-KLUWER,
address = pub-KLUWER:adr,
pages = "xvi + 142",
year = "1996",
ISBN = "0-7923-9718-5",
ISBN-13 = "978-0-7923-9718-2",
LCCN = "QA76.5 .M574 1996",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "The Kluwer international series in engineering and
computer science",
acknowledgement = ack-nhfb,
keywords = "Computer architecture; computer architecture;
Computers -- Design; multiprocessors -- design and
construction; Multiprocessors -- Design and
construction; Parallel computers; parallel computers",
author = "Bradford Nichols and Dick Buttlar and Jackie Proulx
title = "{Pthreads} Programming",
publisher = pub-ORA,
address = pub-ORA:adr,
pages = "xvi + 267",
year = "1996",
ISBN = "1-56592-115-1",
ISBN-13 = "978-1-56592-115-3",
LCCN = "QA76.642.N53 1996",
bibdate = "Mon May 11 11:04:53 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
price = "US\$29.95",
URL = "http://www.amazon.com/exec/obidos/ASIN/1565921151/ref=sim_books/002-4892305-5599452;
acknowledgement = ack-nhfb,
author = "Charles J. Northrup",
title = "Programming with {UNIX} Threads",
publisher = pub-WILEY,
address = pub-WILEY:adr,
pages = "xv + 399",
year = "1996",
ISBN = "0-471-13751-0 (paperback)",
ISBN-13 = "978-0-471-13751-1 (paperback)",
LCCN = "QA76.76.O63 N674 1996",
bibdate = "Tue May 25 07:14:38 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "operating systems (computers); UNIX (computer file)",
author = "Scott J. Norton and Mark D. DiPasquale",
title = "Thread time: the multithreaded programming guide",
publisher = pub-PH,
address = pub-PH:adr,
pages = "xx + 538",
year = "1996",
ISBN = "0-13-190067-6 (paperback)",
ISBN-13 = "978-0-13-190067-7 (paperback)",
LCCN = "QA76.642.N67 1996",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "Hewlett--Packard professional books",
URL = "http://www.amazon.com/exec/obidos/ASIN/0131900676/ref=sim_books/002-4892305-5599452",
acknowledgement = ack-nhfb,
annote = "System requirements: IBM compatible PC; CD-ROM
keywords = "Parallel programming (Computer science)",
author = "Thuan Q. Pham and Pankaj K. Garg",
title = "Multithreaded programming with {Windows NT}",
publisher = pub-PHPTR,
address = pub-PHPTR:adr,
pages = "xviii + 227",
year = "1996",
ISBN = "0-13-120643-5",
ISBN-13 = "978-0-13-120643-4",
LCCN = "QA76.642 .P52 1996",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
annote = "One 3 1/2 in. diskette in pocket inside back cover.",
keywords = "Microsoft Windows NT; multiprocessors;
Multiprocessors; Parallel programming; parallel
programming (computer science); Parallel programming
(Computer science)",
author = "James Philbin and Jan Edler and Otto J. Anshus and
Craig C. Douglas and Kai Li",
title = "Thread Scheduling for Cache Locality",
journal = j-SIGPLAN,
volume = "31",
number = "9",
pages = "60--71",
month = sep,
year = "1996",
ISBN = "0-89791-767-7",
ISBN-13 = "978-0-89791-767-4",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:17:23 MST 2003",
bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
note = "Co-published as SIGOPS Operating Systems Review {\bf
30}(5), December 1996, and as SIGARCH Computer
Architecture News, {\bf 24}(special issue), October
URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/237090/p60-philbin/",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "algorithms; experimentation; performance",
subject = "{\bf D.3.4} Software, PROGRAMMING LANGUAGES,
Processors, Optimization. {\bf I.1.2} Computing
Algorithms, Algebraic algorithms. {\bf F.2.2} Theory of
COMPLEXITY, Nonnumerical Algorithms and Problems,
Sequencing and scheduling. {\bf F.2.1} Theory of
COMPLEXITY, Numerical Algorithms and Problems,
Computations on matrices. {\bf D.2.2} Software,
SOFTWARE ENGINEERING, Design Tools and Techniques, User
author = "Kay A. Robbins and Steven Robbins",
title = "Practical {UNIX} programming: a guide to concurrency,
communication, and multithreading",
publisher = pub-PHPTR,
address = pub-PHPTR:adr,
pages = "xiv + 658",
year = "1996",
ISBN = "0-13-443706-3",
ISBN-13 = "978-0-13-443706-4",
LCCN = "QA76.76.O63 R615 1996",
bibdate = "Tue May 25 07:14:38 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Microcomputers -- Programming; Operating systems; UNIX
(Computer file)",
author = "Lucas Roh and Walid A. Najjar and Bhanu Shankar and A.
P. Wim B{\"o}hm",
title = "Generation, Optimization, and Evaluation of
Multithreaded Code",
journal = j-J-PAR-DIST-COMP,
volume = "32",
number = "2",
pages = "188--204",
day = "1",
month = feb,
year = "1996",
DOI = "https://doi.org/10.1006/jpdc.1996.0013",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:18:59 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0013/production;
acknowledgement = ack-nhfb,
classification = "C1180 (Optimisation techniques); C4230M
(Multiprocessor interconnection); C5220P (Parallel
architecture); C6110P (Parallel programming); C6150C
(Compilers, interpreters and other processors); C6150N
(Distributed systems software)",
corpsource = "Dept. of Comput. Sci., Colorado State Univ., Fort
Collins, CO, USA",
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "architectures; code generation scheme; compiler
intermediate; form; global bottom-up optimization
technique; inputs; instruction level; intrathread
locality; latency tolerance; multiprocessor
interconnection networks; multithreaded; multithreaded
code; multithreaded code evaluation; multithreaded code
generation; multithreaded computation model;
multithreaded synchronization; optimisation; optimising
compilers; parallel; parallel architectures;
parallelising compilers; parallelism; Pebbles;
processor scheduling; processor utilization; program
level; programming; reduced instruction set computing;
scalability; synchronisation; synchronization costs;
top-down code generation",
treatment = "T Theoretical or Mathematical",
author = "David E. Ruddock and Balakrishnan Dasarathy",
title = "Multithreading Programs: Guidelines for {DCE}
journal = j-IEEE-SOFTWARE,
volume = "13",
number = "1",
pages = "80--90",
month = jan,
year = "1996",
ISSN = "0740-7459 (print), 1937-4194 (electronic)",
ISSN-L = "0740-7459",
bibdate = "Sat Jan 25 07:35:26 MST 1997",
bibsource = "Compendex database;
acknowledgement = ack-nhfb,
affiliation = "Bellcore",
affiliationaddress = "Piscataway, NJ, USA",
classification = "722.2; 722.4; 723.1; 723.2; 723.3",
fjournal = "IEEE Software",
journal-URL = "http://www.computer.org/portal/web/csdl/magazines/software",
journalabr = "IEEE Software",
keywords = "Application programming interfaces; Client server
architecture; Computer aided software engineering;
Computer operating systems; Computer programming
languages; Concurrency control; Data communication
systems; Data structures; Distributed computer systems;
Distributed computing environment; Multithreading;
Network services; Remote procedure call; Security of
data; Synchronization; Telecommunication services; User
author = "A. Sah and K. Brown and E. Brewer",
title = "Programming the {Internet} from the server-side with
{Tcl} and {Audience1}",
crossref = "USENIX:1996:ATT",
pages = "235--??, 183--188",
year = "1996",
bibdate = "Sat Mar 15 08:49:09 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classcodes = "C6150N (Distributed systems software); C6115
(Programming support); C6110 (Systems analysis and
programming); C6140D (High level languages); C7230
(Publishing and reproduction); C7250N (Front end
systems for online searching)",
conflocation = "Monterey, CA, USA; 10--13 July 1996",
conftitle = "Proceedings of 4th Annual Tcl/Tk Workshop '96",
corpsource = "Inktomi Corp., Berkeley, CA, USA",
keywords = "applications; Audience1; authoring languages;
client-server; client-server systems; client-side
languages; electronic; end-; extension library; HotBot
search engine; HotWired; Inktomi; Internet; mass
customization features; MTtcl; multi-threaded Tcl;
online front-ends; programming; publishing; server
languages; server-side Internet programming; software
libraries; to-end publishing tool; World Wide Web",
treatment = "P Practical",
author = "D. C. Schmidt and S. Vinoski",
title = "Comparing alternative programming techniques for
multithreaded servers",
journal = j-C-PLUS-PLUS-REPORT,
volume = "8",
number = "2",
pages = "50--59",
month = feb,
year = "1996",
ISSN = "1040-6042",
bibdate = "Tue Mar 25 13:34:48 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classcodes = "C6150N (Distributed systems software); C6110J (Object-
oriented programming); C6160 (Database management
systems (DBMS)); C6140D (High level languages)",
corpsource = "Washington Univ., St. Louis, MO, USA",
fjournal = "C++ Report",
keywords = "applications; C; C language; C++; client-server
systems; CORBA; database management; desktop client;
financial data processing; investment brokers;
languages; multithreaded servers; multithreaded
systems; object-oriented; object-oriented programming;
programming; query processing; stock prices; stock
quote database; synchronization; systems; wrappers",
treatment = "P Practical",
author = "D. C. Schmidt and S. Vinoski",
title = "Comparing alternative programming techniques for
multithreaded {CORBA} servers",
journal = j-C-PLUS-PLUS-REPORT,
volume = "8",
number = "4",
pages = "56--66",
month = apr,
year = "1996",
ISSN = "1040-6042",
bibdate = "Tue Mar 25 13:34:48 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classcodes = "C6110J (Object-oriented programming); C6110P (Parallel
programming); C6140D (High level languages)",
corpsource = "Washington Univ., St. Louis, MO, USA",
fjournal = "C++ Report",
keywords = "C language; complexity; distributed multithreaded
applications; multithreaded CORBA servers;
object-oriented programming; parallel; programming;
programming techniques",
treatment = "P Practical",
author = "D. C. Schmidt and S. Vinoski",
title = "Comparing alternative programming techniques for
multithreaded {CORBA} servers",
journal = j-C-PLUS-PLUS-REPORT,
volume = "8",
number = "7",
pages = "47--56",
month = jul,
year = "1996",
ISSN = "1040-6042",
bibdate = "Tue Mar 25 13:34:48 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classcodes = "C6110J (Object-oriented programming); C6150N
(Distributed systems software); C5690 (Other data
communication equipment and techniques); C6110P
(Parallel programming)",
corpsource = "Washington Univ., St. Louis, MO, USA",
fjournal = "C++ Report",
keywords = "alternative programming techniques; C; C++ wrappers;
concurrency model; CORBA; multithreaded CORBA;
multithreaded stock quote servers; network servers;
object-oriented programming; parallel; programming;
servers; thread per request; thread per session model;
thread pool",
treatment = "P Practical",
author = "Charles Severance and Richard Enbody and Paul
title = "Managing the Overall Balance of Operating System
Threads on a Multiprocessor Using Automatic
Self-Allocating Threads ({ASAT})",
journal = j-J-PAR-DIST-COMP,
volume = "37",
number = "1",
pages = "106--112",
day = "25",
month = aug,
year = "1996",
DOI = "https://doi.org/10.1006/jpdc.1996.0111",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:00 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0111/production;
acknowledgement = ack-nhfb,
classification = "C5440 (Multiprocessing systems); C6110P (Parallel
programming); C6150J (Operating systems); C6150N
(Distributed systems software)",
corpsource = "Dept. of Comput. Sci., Michigan State Univ., East
Lansing, MI, USA",
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "allocating threads; allocation; automatic self-;
multiprocessing system; multiprocessing systems;
operating system; operating systems (computers);
parallel programming; processor scheduling; run-time
environment; self-scheduling; thread; thread
treatment = "P Practical; X Experimental",
author = "U. Sigmund and T. Ungerer",
title = "Identifying Bottlenecks in a Multithreaded Superscalar
journal = j-LECT-NOTES-COMP-SCI,
volume = "1124",
pages = "797--??",
year = "1996",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Oct 29 14:12:39 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "A. Skjellum and B. Protopopov and S. Hebert",
title = "A thread taxonomy for {MPI}",
crossref = "IEEE:1996:PSM",
pages = "50--57",
year = "1996",
bibdate = "Sat Apr 19 16:34:54 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C6110B (Software engineering techniques); C6110F
(Formal methods); C6150E (General utility programs);
C6150J (Operating systems); C6150N (Distributed systems
conftitle = "Proceedings. Second MPI Developer's Conference",
corpsource = "Dept. of Comput. Sci., Mississippi State Univ., MS,
keywords = "API extensions; application program interfaces;
Channel Device; computational unit; fine-grain
concurrency; formal specification; message passing;
minimal portable thread management; MPI; MPICH;
multi-threaded thread-safe ADI; non-thread-safe MPI
call semantics; resource container; software
portability; synchronisation; synchronization
mechanisms; thread models; thread safety; thread
taxonomy; user-level mechanism; utility programs;
Windows NT version",
sponsororg = "IEEE Comput. Soc. Tech. Committee on Distributed
treatment = "P Practical",
author = "Neelakantan Sundaresan and Dennis Gannon",
title = "{Coir}: An Object-Oriented System for Control and
Dynamic Data Parallelism",
journal = j-J-PAR-DIST-COMP,
volume = "37",
number = "1",
pages = "98--105",
day = "25",
month = aug,
year = "1996",
DOI = "https://doi.org/10.1006/jpdc.1996.0110",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:00 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0110/production;
acknowledgement = ack-nhfb,
classification = "C4240P (Parallel programming and algorithm theory);
C5220P (Parallel architecture); C6110J (Object-oriented
programming); C6110P (Parallel programming); C6150N
(Distributed systems software)",
corpsource = "Applic. Dev. Technol. Inst., IBM Corp., San Jose, CA,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "C++ library; Coir; distributed memory machines;
distributed memory systems; dynamic data parallelism;
message passing; message-passing; multithreading;
object-oriented; object-oriented system; operating
system; parallel; parallel architectures; parallel
programming; programming; shared memory systems;
symmetric multiprocessors; synchronisation",
treatment = "P Practical; T Theoretical or Mathematical",
author = "Dean M. Tullsen and Susan J. Eggers and Joel S. Emer
and Henry M. Levy and Jack L. Lo and Rebecca L. Stamm",
title = "Exploiting choice: instruction fetch and issue on an
implementable simultaneous multithreading processor",
journal = j-COMP-ARCH-NEWS,
volume = "24",
number = "2",
pages = "191--202",
month = may,
year = "1996",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:47 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Dean Michael Tullsen",
title = "Simultaneous multithreading",
type = "Thesis ({Ph.D.})",
school = "University of Washington",
address = "Seattle, WA, USA",
pages = "vi + 99",
year = "1996",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Computer architecture; Parallel processing (Electronic
author = "Anthony Verriello",
title = "Memory sharing in multithreaded transaction
type = "Thesis ({M.S.})",
school = "Hofstra University",
address = "Westport, CT, USA",
pages = "180",
year = "1996",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Memory, Virtual (Computer science); Transaction
systems (Computer systems)",
author = "S. Vinoski and D. C. Schmidt",
title = "Distributed callbacks and decoupled communication in
journal = j-C-PLUS-PLUS-REPORT,
volume = "8",
number = "9",
pages = "48--56, 77",
month = oct,
year = "1996",
ISSN = "1040-6042",
bibdate = "Tue Mar 25 13:34:48 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classcodes = "C6150N (Distributed systems software); C6110J (Object-
oriented programming)",
corpsource = "Hewlett--Packard's Distributed Comput. Program,
Chelmsford, MA, USA",
fjournal = "C++ Report",
keywords = "client-server systems; client/server; concurrency
control; concurrency models; consumers; CORBA;
decoupled communication; decoupled peer-to-peer;
distributed callbacks; distributed object computing
systems; distributed stock quoting; multithreaded;
object-oriented; OMG Events object service;
programming; relationships; request communication;
response communication; server applications; suppliers;
treatment = "P Practical",
author = "V. Vlassov and L.-E. Thorelli",
title = "Analytical Models of Multithreading with Data
journal = j-LECT-NOTES-COMP-SCI,
volume = "1124",
pages = "714--??",
year = "1996",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Oct 29 14:12:39 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "David S. Wise and Joshua Walgenbach",
title = "Static and dynamic partitioning of pointers as links
and threads",
journal = j-SIGPLAN,
volume = "31",
number = "6",
pages = "42--49",
month = jun,
year = "1996",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:17:20 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
affiliation = "Dept. of Comput. Sci., Indiana Univ., Bloomington, IN,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Roland Wism{\"u}ller and Michael Oberhuber and Johann
Krammer and Olav Hansen",
title = "Interactive debugging and performance analysis of
massively parallel applications",
volume = "22",
number = "3",
pages = "415--442",
day = "29",
month = apr,
year = "1996",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Fri Aug 6 10:14:54 MDT 1999",
bibsource = "Compendex database;
URL = "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1996&volume=22&issue=3&aid=1049",
acknowledgement = ack-nhfb,
affiliation = "Inst f{\"u}r Informatik der Technischen
Universit{\"a}t M{\"u}nchen",
affiliationaddress = "M{\"u}nchen, Ger",
classification = "722.2; 722.4; 723.1; 723.2; 723.5; C6110P (Parallel
programming); C6115 (Programming support); C6150G
(Diagnostic, testing, debugging and evaluating
corpsource = "Inst. f{\"u}r Inf., Tech. Univ. M{\"u}nchen, Germany",
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
journalabr = "Parallel Comput",
keywords = "applications; attributed measurements; Codes
(symbols); Computer debugging; Computer programming;
Computer simulation; debugger; debugging; DETOP;
Distributed computer systems; distributed evaluation;
Distributed online monitoring system; environments;
Interactive computer systems; Interactive debugging;
intrusion; massively parallel; Massively parallel
applications; minimal; monitoring system; multithreaded
programming models; Online systems; parallel; Parallel
debugger; Parallel processing systems; parallel
programming; Parallelization; PATOP; Performance;
performance analysis; Performance analysis; performance
analyzer; performance bottlenecks; Personal computers;
PowerPC; program debugging; programming; scalability;
software; software performance evaluation;
Supercomputers; tools; usability; User interfaces",
treatment = "P Practical",
author = "Michael Yam",
title = "{DCE} Pthreads versus {NT} Threads. {Michael} ports
{PTF}, a {C++} class library for {DCE} pthreads, from
{HP-UX System 9} to {Windows NT}. {In} doing so, he
examines the differences between pthreads and {NT}
threads, and describes the porting experience",
journal = j-DDJ,
volume = "21",
number = "12",
pages = "16--??",
month = dec,
year = "1996",
ISSN = "1044-789X",
bibdate = "Mon Dec 2 07:52:21 MST 1996",
bibsource = "http://www.ddj.com/index/author/index.htm;
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "H. Chuck Yoo",
title = "Comparative Analysis of Asynchronous {I/O} in
Multithreaded {UNIX}",
journal = j-SPE,
volume = "26",
number = "9",
pages = "987--997",
month = sep,
year = "1996",
ISSN = "0038-0644 (print), 1097-024X (electronic)",
ISSN-L = "0038-0644",
bibdate = "Thu Jul 29 15:11:03 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=16832",
acknowledgement = ack-nhfb,
fjournal = "Software --- Practice and Experience",
journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
author = "Namhoon Yoo",
title = "Parallelism control in multithreaded multiprocessors",
type = "Thesis ({Ph.D.})",
school = "University of Southern California",
address = "Los Angeles, CA, USA",
pages = "x + 86",
year = "1996",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Computer architecture; Data flow computing;
Multiprocessors; Parallel processing (Electronic
author = "Bernard Zignin",
title = "Techniques du multithread: du parall{\'e}lisme dans
les processus {(French) [Multithreading techniques:
parallelism in processes]}",
publisher = pub-HERMES,
address = pub-HERMES:adr,
pages = "72",
year = "1996",
ISBN = "2-86601-562-2",
ISBN-13 = "978-2-86601-562-6",
LCCN = "????",
bibdate = "Wed Dec 09 23:36:26 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "CNAM. Syntheses informatiques",
acknowledgement = ack-nhfb,
keywords = "Parall{\'e}lisme (informatique)",
language = "French",
author = "Anonymous",
title = "New Products: {WebThreads 1.0.1; QUERYFLEX Report
Writer; Linux Pro Desktop 1.0; NDP Fortran for Linux;
Numerics and Visualization for Java; Craftworks
Linux/AXP 2.2; InfoDock Linux Software Development
Toolset; Caldera Wabi 2.2 for Linux}",
journal = j-LINUX-J,
volume = "34",
pages = "??--??",
month = feb,
year = "1997",
ISSN = "1075-3583 (print), 1938-3827 (electronic)",
ISSN-L = "1075-3583",
bibdate = "Fri Oct 9 08:35:26 MDT 1998",
bibsource = "http://noframes.linuxjournal.com/lj-issues/issue34/index.html;
acknowledgement = ack-nhfb,
fjournal = "Linux journal",
journal-URL = "http://portal.acm.org/citation.cfm?id=J508",
author = "Anonymous",
title = "Technology News \& Reviews: {Chemkin} software;
{OpenMP Fortran Standard}; {ODE} Toolbox for {Matlab};
{Java} products; {Scientific WorkPlace 3.0}",
journal = j-IEEE-COMPUT-SCI-ENG,
volume = "4",
number = "4",
pages = "75--??",
month = oct # "\slash " # dec,
year = "1997",
ISSN = "1070-9924 (print), 1558-190X (electronic)",
ISSN-L = "1070-9924",
bibdate = "Sat Jan 9 08:57:23 MST 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java.bib;
URL = "http://dlib.computer.org/cs/books/cs1997/pdf/c4075.pdf",
acknowledgement = ack-nhfb,
fjournal = "IEEE Computational Science \& Engineering",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=99",
author = "Anonymous",
title = "Tech Watch --- Pattern-recognition system. {Piecing}
together history. {3D} semiconductor simulation.
{Multi}-threaded architecture",
journal = j-CG-WORLD,
volume = "20",
number = "9",
pages = "15--??",
month = sep,
year = "1997",
ISSN = "0271-4159",
bibdate = "Sat Nov 7 10:32:27 MST 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Computer Graphics World",
author = "Arvind and A. Caro and J.-W. Maessen and S. Aditya",
title = "A Multithreaded Substrate and Compilation Model for
the Implicitly Parallel Language {pH}",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1239",
pages = "519--??",
year = "1997",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Fri Aug 22 11:59:49 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "M. Bednorz and A. Gwozdowski and K. Zieli{\'n}ski",
title = "Contextual debugging and analysis of multithreaded
journal = j-CPE,
volume = "9",
number = "2",
pages = "123--139",
month = feb,
year = "1997",
ISSN = "1040-3108",
ISSN-L = "1040-3108",
bibdate = "Tue Sep 7 06:06:28 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=13852;
acknowledgement = ack-nhfb,
fjournal = "Concurrency, practice and experience",
author = "Jim Beveridge and Robert Wiener",
title = "Multithreading applications in {Win32}: the complete
guide to threads",
publisher = pub-AWDP,
address = pub-AWDP:adr,
pages = "xviii + 368",
year = "1997",
ISBN = "0-201-44234-5 (pb) 0-201-18385-4 (CD-ROM)",
ISBN-13 = "978-0-201-44234-2 (pb) 978-0-201-18385-6 (CD-ROM)",
LCCN = "QA76.76.O63 B478 1997",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
annote = "System requirements: IBM compatible PC; Win32; Windows
NT or Windows 95; CD-ROM drive.",
keywords = "Microsoft Win32; Microsoft Windows (Computer file);
Microsoft Windows NT; Operating systems (Computers)",
author = "Aart J. C. Bik and Juan E. Villacis and Dennis B.
title = "javar: a prototype {Java} restructuring compiler",
journal = j-CPE,
volume = "9",
number = "11",
pages = "1181--1191",
month = nov,
year = "1997",
ISSN = "1040-3108",
ISSN-L = "1040-3108",
bibdate = "Tue Sep 7 06:06:35 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "Special Issue: Java for computational science and
engineering --- simulation and modeling II.",
URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=13819;
acknowledgement = ack-nhfb,
classification = "C6110J (Object-oriented programming); C6110P
(Parallel programming); C6150C (Compilers, interpreters
and other processors)",
conflocation = "Las Vegas, NV, USA; 21 June 1997",
conftitle = "Java for Computational Science and Engineering ---
Simulation and Modeling II",
corpsource = "Dept. of Comput. Sci., Indiana Univ., Bloomington, IN,
fjournal = "Concurrency, practice and experience",
keywords = "annotations; explicit parallelism; functionality;
implicit parallelism; Java program parallelization;
Java restructuring compiler; javar; multi-threading;
object-oriented languages; parallelising compilers;
prototype; semantic analysis; software prototyping",
pubcountry = "UK",
sponsororg = "ACM",
treatment = "P Practical",
author = "Rajesh Bordawekar and Steven Landherr and Don Capps
and Mark Davis",
title = "Experimental evaluation of the {Hewlett--Packard}
{Exemplar} file system",
journal = j-SIGMETRICS,
volume = "25",
number = "3",
pages = "21--28",
month = dec,
year = "1997",
CODEN = "????",
DOI = "https://doi.org/10.1145/270900.270904",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Thu Jun 26 11:24:50 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This article presents results from an experimental
evaluation study of the HP Exemplar file system. Our
experiments consist of simple micro-benchmarks that
study the impact of various factors on the file system
performance. These factors include I/O request/buffer
sizes, vectored/non-vectored access patterns,
read-ahead policies, multi-threaded (temporally
irregular) requests, and architectural issues (cache
parameters, NUMA behavior, etc.). Experimental results
indicate that the Exemplar file system provides high
I/O bandwidth, both for single- and multi-threaded
applications. The buffer cache, with prioritized buffer
management and large buffer sizes, is effective in
exploiting temporal and spatial access localities. The
performance of non-contiguous accesses can be improved
by either using vectored I/O interfaces or tuning the
read-ahead facilities. The file system performance
depends on the relative locations of the computing
threads and the file system, and also on various
Exemplar design parameters such as the NUMA
architecture, TLB/data cache management and paging
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
author = "Randall Bramley",
title = "Technology News \& Reviews: {Chemkin} software;
{OpenMP Fortran Standard}; {ODE} Toolbox for {Matlab};
{Java} products; {Scientific WorkPlace 3.0}",
journal = j-IEEE-COMPUT-SCI-ENG,
volume = "4",
number = "4",
pages = "75--78",
month = oct # "\slash " # dec,
year = "1997",
ISSN = "1070-9924 (print), 1558-190X (electronic)",
ISSN-L = "1070-9924",
bibdate = "Sat Jan 9 08:57:23 MST 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputscieng.bib;
URL = "http://dlib.computer.org/cs/books/cs1997/pdf/c4075.pdf",
acknowledgement = ack-nhfb,
fjournal = "IEEE Computational Science \& Engineering",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=99",
remark = "No DOI available: article missing from IEEE Xplore
author = "David R. Butenhof",
title = "Programming with {POSIX} threads",
publisher = pub-AW,
address = pub-AW:adr,
pages = "xviii + 381",
year = "1997",
ISBN = "0-201-63392-2",
ISBN-13 = "978-0-201-63392-4",
LCCN = "QA76.76.T55B88 1997",
bibdate = "Mon Sep 01 08:53:12 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
price = "US\$31.95",
URL = "http://www.amazon.com/exec/obidos/ASIN/0201633922/ref=sim_books/002-4892305-5599452",
acknowledgement = ack-nhfb,
author = "John Calcote",
title = "Thread Pools and Server Performance",
journal = j-DDJ,
volume = "22",
number = "7",
pages = "60--??",
month = jul,
year = "1997",
ISSN = "1044-789X",
bibdate = "Sat Jun 28 10:43:47 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "P. Cenciarelli and A. Knapp and B. Reus and M.
title = "From sequential to multi-threaded {Java}: An
event-based operational semantics",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1349",
pages = "75--??",
year = "1997",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Apr 28 08:51:33 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java.bib;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "P. Cenciarelli and A. Knapp and B. Reus and M.
title = "From sequential to multi-threaded {Java}: An
event-based operational semantics",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1349",
pages = "75--??",
year = "1997",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Apr 28 08:51:33 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Yong Dou and Zhengbing Pang and Xingming Zhou",
title = "Implementing a software virtual shared memory on
crossref = "IEEE:1997:APD",
pages = "??--??",
year = "1997",
bibdate = "Wed Apr 16 06:39:19 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C6110P (Parallel programming); C6115 (Programming
support); C6120 (File organisation); C6140D (High level
languages); C7430 (Computer engineering)",
corpsource = "Dept. of Comput. Sci., Changsha Inst. of Technol.,
Hunan, China",
keywords = "distributed; FORTRAN; FORTRAN language; GKD-VSM;
memory environments; multithread scheme; parallel
programming; parallel programming model; Prefetch and
Poststore; programming environments; PVM; shared
memory; software overhead; software virtual shared
memory; synchronisation; user-level; virtual machines;
virtual storage",
treatment = "P Practical",
author = "Susan J. Eggers and Joel S. Emer and Henry M. Levy and
Jack L. Lo and Rebecca L. Stamm and Dean M. Tullsen",
title = "Simultaneous Multithreading: a Platform for
Next-Generation Processors",
journal = j-IEEE-MICRO,
volume = "17",
number = "5",
pages = "12--19",
month = sep # "\slash " # oct,
year = "1997",
DOI = "https://doi.org/10.1109/40.621209",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Thu Dec 14 06:08:58 MST 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
Science Citation Index database (1980--2000)",
URL = "http://dlib.computer.org/mi/books/mi1997/pdf/m5012.pdf;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Richard J. Eickemeyer",
title = "Evaluation of multithreaded processors and
thread-switch policies",
type = "Research report",
number = "RC 20956 (92759)",
institution = "IBM T. J. Watson Research Center",
address = "Yorktown Heights, NY, USA",
pages = "16",
day = "18",
month = aug,
year = "1997",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper examines the use of coarse-grained
multithreading to lessen the negative impact of memory
access latencies on the performance of uniprocessor
on-line transaction processing systems. It considers
the effect of switching threads on cache misses in a
two-level cache system. It also examines several
different thread-switch policies. The results suggest
that multithreading with a small number (3-5) of active
threads can significantly improve the performance of
such commercial environments.",
acknowledgement = ack-nhfb,
keywords = "Cache memory; Computer architecture; Threads (Computer
author = "E. A. Emerson and A. P. Sistla",
title = "Utilizing Symmetry when Model-Checking under Fairness
Assumptions: An Automata-Theoretic Approach",
journal = j-TOPLAS,
volume = "19",
number = "4",
pages = "617--638",
month = jul,
year = "1997",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Wed Dec 3 16:28:05 MST 1997",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/journals/toplas/1997-19-4/p617-emerson/",
abstract = "One useful technique for combating the state explosion
problem is to exploit symmetry when performing temporal
logic model checking. In previous work it is shown how,
using some basic notions of group theory, symmetry may
be exploited for the full range of correctness
properties expressible in the very expressive temporal
logic CTL*. Surprisingly, while fairness properties are
readily expressible in CTL*, these methods are not
powerful enough to admit any amelioration of state
explosion, when fairness assumptions are involved. We
show that it is nonetheless possible to handle fairness
efficiently by trading some group theory for automata
theory. Our automata-theoretic approach depends on
detecting fair paths subtly encoded in a quotient
structure whose arcs are annotated with permutations,
by using a threaded structure that reflects coordinate
shifts caused by the permutations.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Programming Languages and
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
keywords = "design; languages; theory; verification",
subject = "{\bf F.3.1} Theory of Computation, LOGICS AND MEANINGS
OF PROGRAMS, Specifying and Verifying and Reasoning
about Programs. {\bf F.1.1} Theory of Computation,
{\bf D.2.4} Software, SOFTWARE ENGINEERING,
Software/Program Verification.",
author = "Marco Fillo and Stephen W. Keckler and William J.
Dally and Nicholas P. Carter and Andrew Chang and
Yevgeny Gurevich and Whay S. Lee",
title = "The {M}-Machine Multicomputer",
journal = j-INT-J-PARALLEL-PROG,
volume = "25",
number = "3",
pages = "183--212",
month = jun,
year = "1997",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Tue Apr 7 18:25:25 MDT 1998",
bibsource = "Compendex database;
acknowledgement = ack-nhfb,
affiliation = "Massachusetts Inst of Technology",
affiliationaddress = "Cambridge, MA, USA",
classification = "714.2; 722; 722.1; 722.4; 723; 723.1",
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
journalabr = "Int J Parallel Program",
keywords = "Buffer storage; Computer architecture; Data storage
equipment; Microprocessor chips; Multiprogramming;
Multithread processors; On chip cache; Parallel
processing systems; Synchronization; Thread level
parallelism; User interfaces",
author = "Michael T. Fisher",
title = "A study of the performance of simultaneous
multithreading on a superscalar processor",
type = "Thesis ({M.S.E.E.})",
number = "2363",
school = "State University of New York at Binghamton, Watson
School of Engineering and Applied Science",
address = "Binghamton, NY, USA",
pages = "vi + 98",
year = "1997",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "Master's theses / State University of New York at
acknowledgement = ack-nhfb,
alttitle = "Simultaneous multithreading on a superscalar processor
Multithreading on a superscalar processor Superscalar
keywords = "Microprocessors -- Testing",
author = "Waipang Fong",
title = "Building a preprocessor for a multithreading
type = "Thesis ({M.E.E.})",
school = "Department of Electrical Engineering, University of
address = "Tuscaloosa, AL, USA",
pages = "ix + 80",
year = "1997",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Multiprocessors; Parallel processing (Electronic
author = "M. Forsell",
title = "{MTAC} --- a Multithreaded {VLIW} Architecture for
{PRAM} Simulation",
journal = j-J-UCS,
volume = "3",
number = "9",
pages = "1037--1055",
day = "28",
month = sep,
year = "1997",
CODEN = "????",
ISSN = "0948-695X (print), 0948-6968 (electronic)",
ISSN-L = "0948-6968",
bibdate = "Wed Mar 4 15:32:49 MST 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://medoc.springer.de:8000/jucs/jucs_3_9/mtac_a_multithreaded_vliw",
acknowledgement = ack-nhfb,
fjournal = "J.UCS: Journal of Universal Computer Science",
journal-URL = "http://www.jucs.org/jucs",
author = "Ian Foster and Jonathan Geisler and Carl Kesselman and
Steven Tuecke",
title = "Managing Multiple Communication Methods in
High-Performance Networked Computing Systems",
journal = j-J-PAR-DIST-COMP,
volume = "40",
number = "1",
pages = "35--48",
day = "10",
month = jan,
year = "1997",
DOI = "https://doi.org/10.1006/jpdc.1996.1266",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:01 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production;
acknowledgement = ack-nhfb,
classification = "B6150M (Protocols); B6210L (Computer
communications); C5440 (Multiprocessing systems); C5470
(Performance evaluation and testing); C5640
(Protocols); C5670 (Network performance)",
corpsource = "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "Argonne MPICH library; computer networks; computing
systems; criteria; heterogeneous networked environment;
high-performance networked; message passing; message
passing interface; multimethod communication; multiple
communication methods; multithreaded runtime system;
networked computing environments; Nexus; Nexus-based
MPI implementation; performance characteristics;
performance evaluation; protocols; remote service
request mechanisms; transport mechanisms;
user-specified selection",
treatment = "P Practical",
author = "Tetsuya Theodore Fujita",
title = "A multithreaded processor architecture for parallel
symbolic computation",
type = "Technical Report",
number = "MIT/LCS/TM-338",
institution = "Laboratory for Computer Science, Massachusetts
Institute of Technology",
address = "Cambridge, MA, USA",
pages = "71",
month = sep,
year = "1997",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Computer architecture; Multilisp (Computer program
language); Parallel processing (Electronic computers)",
author = "Seth Copen Goldstein",
title = "Lazy threads: compiler and runtime structures for
fine-grained parallel programming",
type = "Thesis ({Ph.D.})",
number = "UCB/CSD-97-975",
school = "Computer Science Division, University of California,
address = "Berkeley, CA, USA",
pages = "xi + 174",
year = "1997",
LCCN = "TK7885.A1 R46 no.97:975",
bibdate = "Fri May 10 12:18:17 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "Report",
acknowledgement = ack-nhfb,
author = "Juan Carlos Gomez and Vernon Rego and V. S. Sunderam",
title = "Efficient Multithreaded User-Space Transport for
Network Computing: Design and Test of the {TRAP}
Protocol",
journal = j-J-PAR-DIST-COMP,
volume = "40",
number = "1",
pages = "103--117",
day = "10",
month = jan,
year = "1997",
DOI = "https://doi.org/10.1006/jpdc.1996.1269",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:01 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1269/production;
acknowledgement = ack-nhfb,
classification = "B6150M (Protocols); B6210L (Computer
communications); C5620 (Computer networks and
techniques); C5640 (Protocols); C6150G (Diagnostic,
testing, debugging and evaluating systems); C6150N
(Distributed systems software)",
corpsource = "Dept. of Comput. Sci., Purdue Univ., West Lafayette,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
keywords = "communicating; communication; computer networks;
computing; computing nodes; efficient multithreaded
user-space transport; high-; low-latency; message
passing; multithreaded message-passing libraries;
network; nodes; performance distributed computing
applications; processing; runtime performance;
scalability characteristics; software libraries;
software performance evaluation; testing; transaction;
transaction-oriented protocol; transport protocols;
TRAP protocol design; TRAP protocol testing; TRAP-based
communication library; user-space protocol",
treatment = "P Practical",
author = "B. Goossens",
title = "A Multithreaded Vector Co-processor",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1277",
pages = "311--??",
year = "1997",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Apr 28 08:51:33 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Ian Gorton and Innes E. Jelly",
title = "{Guest Editors} Introduction: Software Engineering for
Parallel and Distributed Systems: Challenges and
Opportunities",
journal = j-IEEE-CONCURR,
volume = "5",
number = "3",
pages = "12--15",
month = jul # "\slash " # sep,
year = "1997",
ISSN = "1092-3063 (print), 1558-0849 (electronic)",
ISSN-L = "1092-3063",
bibdate = "Tue Jan 16 06:04:48 MST 2001",
bibsource = "Compendex database;
URL = "http://dlib.computer.org/pd/books/pd1997/pdf/p3012.pdf",
acknowledgement = ack-nhfb,
affiliation = "Commonwealth Science and Industrial Research
affiliationaddress = "Aust",
classification = "722; 722.4; 723; 723.1; 723.3",
fjournal = "IEEE Concurrency",
journalabr = "IEEE Concurrency",
keywords = "Computer workstations; Concurrency control; Fault
tolerant computer systems; High performance computing;
Multithreaded servers; Parallel processing systems;
Program debugging; Program diagnostics; Software
engineering; World wide web",
author = "B. K. Gunther",
title = "Multithreading with distributed functional units",
journal = j-IEEE-TRANS-COMPUT,
volume = "46",
number = "4",
pages = "399--411",
month = apr,
year = "1997",
DOI = "https://doi.org/10.1109/12.588034",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Wed Jul 6 10:06:22 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=588034",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Matthew Haines",
title = "On designing lightweight threads for substrate
software",
number = "201645",
publisher = pub-NTIS,
address = pub-NTIS:adr,
pages = "??",
year = "1997",
LCCN = "DOC NAS 1.26:201645 mf11",
bibdate = "Fri May 10 12:18:17 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Shipping list number 98-0847-M.",
series = "NASA contractor report",
acknowledgement = ack-nhfb,
keywords = "operating systems (computers); parallel computers;
parallel processing (computers); threads",
author = "Matthew Haines and Piyush Mehrotra and David Cronk",
title = "Data-parallel programming in a multithreaded
environment",
journal = j-SCI-PROG,
volume = "6",
number = "2",
pages = "187--200",
month = "Summer",
year = "1997",
ISSN = "1058-9244 (print), 1875-919X (electronic)",
ISSN-L = "1058-9244",
bibdate = "Thu Mar 28 12:27:27 MST 2002",
bibsource = "Compendex database;
acknowledgement = ack-nhfb,
fjournal = "Scientific Programming",
journal-URL = "http://iospress.metapress.com/content/1058-9244",
author = "Matthew Haines",
title = "An Open Implementation Analysis and Design for
Lightweight Threads",
journal = j-SIGPLAN,
volume = "32",
number = "10",
pages = "229--242",
month = oct,
year = "1997",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:17:39 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "David R. Hanson",
title = "{C} Interfaces and Implementations: Techniques for
Creating Reusable Software",
publisher = pub-AW,
address = pub-AW:adr,
pages = "xvii + 519",
year = "1997",
ISBN = "0-201-49841-3",
ISBN-13 = "978-0-201-49841-7",
LCCN = "QA76.73.C15H37 1997",
bibdate = "Fri Feb 27 16:08:11 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
price = "US\$37.95",
series = "Addison-Wesley Professional Computing Series",
URL = "http://www.cs.princeton.edu/software/cii/",
acknowledgement = ack-nhfb,
annote = "Multithreading is discussed in Chapter 20.",
author = "Laurie J. Hendren and Xinan Tang and Yingchun Zhu and
Shereen Ghobrial and Guang R. Gao and Xun Xue and
Haiying Cai and Pierre Ouellet",
title = "Compiling {C} for the {EARTH} Multithreaded
Architecture",
journal = j-INT-J-PARALLEL-PROG,
volume = "25",
number = "4",
pages = "305--338",
month = aug,
year = "1997",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Tue Apr 7 18:25:25 MDT 1998",
bibsource = "Compendex database;
acknowledgement = ack-nhfb,
affiliation = "McGill Univ",
affiliationaddress = "Montreal, Que, Can",
classification = "722; 722.4; 723; 723.1; 723.1.1; 723.2",
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
journalabr = "Int J Parallel Program",
keywords = "C (programming language); Codes (symbols); Computer
architecture; earth C programming language;
Multithreaded architecture; Parallel processing
systems; Program compilers; Program translators",
author = "Lauren Hightower",
title = "Publishing Dynamic Data on the {Internet} ---
{Allaire's Cold Fusion} is a development tool that
provides access (via the {Web}) to any database the
{Web} server can access using {ODBC}. {Cold Fusion}
runs as a multithreaded {Windows NT} system service and
works with any {ODBC-compliant} database",
journal = j-DDJ,
volume = "22",
number = "1",
pages = "70--??",
month = jan,
year = "1997",
ISSN = "1044-789X",
bibdate = "Fri Jan 3 06:17:24 MST 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Cameron Hughes and Tracey Hughes",
title = "Object-oriented multithreading using {C++}",
publisher = pub-WILEY,
address = pub-WILEY:adr,
pages = "xvi + 495",
year = "1997",
ISBN = "0-471-18012-2 (paperback)",
ISBN-13 = "978-0-471-18012-8 (paperback)",
LCCN = "QA76.73.C153H84 1997",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
annote = "System requirements: Windows 95, or OS/2 2.0 and
above, or UNIX, or system with POSIX pthreads; ANSI/ISO
compliant C++ compiler.",
keywords = "C++ (Computer program language); POSIX (Computer
software standard); Threads (Computer programs)",
author = "P. Kacsuk and M. Amamiya",
title = "A Multithreaded Implementation Concept of {Prolog} on
{Datarol-II} Machine",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1336",
pages = "91--??",
year = "1997",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Apr 28 08:51:33 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Harold R. Kasperink and John C. Dekker",
title = "Concurrent Database Commands and {C++}",
journal = j-DDJ,
volume = "22",
number = "8",
pages = "84, 86, 88, 89, 98",
month = aug,
year = "1997",
ISSN = "1044-789X",
bibdate = "Sat Aug 23 07:57:02 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Mapping design problems to programming problems leads
to software solutions that are easy to extend and
reuse. Our authors explain how they resolved
multithreaded porting problems using design patterns.
The database they use is Oracle and the database
transactions are implemented using Oracle ProC as an
embedded database command language.",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Samir Khosla",
title = "Multithreading the asynchronous trigger processor",
type = "Thesis ({M.S.})",
school = "University of Florida",
address = "Gainesville, FL, USA",
pages = "ix + 57",
year = "1997",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Panos Kougiouris and Marco Framba",
title = "A Portable Multithreading Framework",
journal = j-CCCUJ,
volume = "15",
number = "8",
pages = "??--??",
month = aug,
year = "1997",
ISSN = "1075-2838",
bibdate = "Wed Aug 20 10:44:42 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Orran Krieger and Michael Stumm",
title = "{HFS}: a Performance-Oriented Flexible File System
Based on Building-Block Compositions",
journal = j-TOCS,
volume = "15",
number = "3",
pages = "286--321",
month = aug,
year = "1997",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Wed Jan 13 18:36:53 MST 1999",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p286-krieger/",
abstract = "The Hurricane File System (HFS) is designed for
(potentially large-scale) shared-memory
multiprocessors. Its architecture is based on the
principle that, in order to maximize performance for
applications with diverse requirements, a file system
must support a wide variety of file structures, file
system policies, and I/O interfaces. Files in HFS are
implemented using simple building blocks composed in
potentially complex ways. This approach yields great
flexibility, allowing an application to customize the
structure and policies of a file to exactly meet its
requirements. As an extreme example, HFS allows a
file's structure to be optimized for concurrent
random-access write-only operations by 10 threads,
something no other file system can do. Similarly, the
prefetching, locking, and file cache management
policies can all be chosen to match an application's
access pattern. In contrast, most parallel file systems
support a single file structure and a small set of
policies. We have implemented HFS as part of the
Hurricane operating system running on the Hector
shared-memory multiprocessor. We demonstrate that the
flexibility of HFS comes with little processing or I/O
overhead. We also show that for a number of file access
patterns, HFS is able to deliver to the applications
the full I/O bandwidth of the disks on our system.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
keywords = "design; performance",
subject = "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems
Management, File organization. {\bf D.4.3} Software,
OPERATING SYSTEMS, File Systems Management, Access
methods. {\bf D.4.8} Software, OPERATING SYSTEMS,
Performance, Measurements. {\bf E.5} Data, FILES,
Optimization**. {\bf E.5} Data, FILES,
author = "H. Kwak and B. Lee and A. R. Hurson",
title = "Viability of Multithreading on Networks of
Workstations",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1277",
pages = "216--??",
year = "1997",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Apr 28 08:51:33 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Duncan Walter Temple Lang",
title = "A multi-threaded extension to a high level interactive
statistical computing environment",
type = "Thesis ({Ph.D. in Statistics})",
school = "University of California, Berkeley",
address = "Berkeley, CA, USA",
pages = "vii + 161",
month = dec,
year = "1997",
LCCN = "308t 1997 951",
bibdate = "Fri Aug 7 08:29:38 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Dissertations -- Academic -- UCB -- statistics --
1991--2000; University of California, Berkeley. Dept.
of Statistics -- Dissertations",
author = "Michael Larbi",
title = "Book Review: {Multithreading Applications in Win32}",
journal = j-CCCUJ,
volume = "15",
number = "7",
pages = "65--??",
month = jul,
year = "1997",
ISSN = "1075-2838",
bibdate = "Thu Jun 26 14:12:46 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "C. E. Leiserson",
title = "Algorithmic analysis of multithreaded algorithms",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1350",
pages = "132--??",
year = "1997",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Apr 28 08:51:33 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Peter J. Leven",
title = "A multithreaded implementation of a {Robot Control C
type = "Thesis ({M.S.})",
school = "University of Illinois at Urbana-Champaign",
address = "Urbana-Champaign, IL, USA",
pages = "x + 72",
year = "1997",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Jack L. Lo and Joel S. Emer and Henry M. Levy and
Rebecca L. Stamm and Dean M. Tullsen",
title = "Converting Thread-Level Parallelism to
Instruction-Level Parallelism via Simultaneous
Multithreading",
journal = j-TOCS,
volume = "15",
number = "3",
pages = "322--354",
month = aug,
year = "1997",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Wed Jan 13 18:36:53 MST 1999",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p322-lo/",
abstract = "To achieve high performance, contemporary computer
systems rely on two forms of parallelism:
instruction-level parallelism (ILP) and thread-level
parallelism (TLP). Wide-issue super-scalar processors
exploit ILP by executing multiple instructions from a
single program in a single cycle. Multiprocessors (MP)
exploit TLP by executing different threads in parallel
on different processors. Unfortunately, both parallel
processing styles statically partition processor
resources, thus preventing them from adapting to
dynamically changing levels of ILP and TLP in a
program. With insufficient TLP, processors in an MP
will be idle; with insufficient ILP, multiple-issue
hardware on a superscalar is wasted. This article
explores parallel processing on an alternative
architecture, simultaneous multithreading (SMT), which
allows multiple threads to compete for and share all
of the processor's resources every cycle. The most
compelling reason for running parallel applications on
an SMT processor is its ability to use thread-level
parallelism and instruction-level parallelism
interchangeably. By permitting multiple threads to
share the processor's functional units simultaneously,
the processor can use both ILP and TLP to accommodate
variations in parallelism. When a program has only a
single thread, all of the SMT processor's resources can
be dedicated to that thread; when more TLP exists, this
parallelism can compensate for a lack of per-thread
ILP. We examine two alternative on-chip parallel
architectures for the next generation of processors. We
compare SMT and small-scale, on-chip multiprocessors in
their ability to exploit both ILP and TLP. First, we
identify the hardware bottlenecks that prevent
multiprocessors from effectively exploiting ILP. Then,
we show that because of its dynamic resource sharing,
SMT avoids these inefficiencies and benefits from being
able to run more threads on a single processor. The use
of TLP is especially advantageous when per-thread ILP
is limited. The ease of adding additional thread
contexts on an SMT (relative to adding additional
processors on an MP) allows simultaneous multithreading
to expose more parallelism, further increasing
functional unit utilization and attaining a 52\%
average speedup (versus a four-processor, single-chip
multiprocessor with comparable execution resources).
This study also addresses an often-cited concern
regarding the use of thread-level parallelism or
multithreading: interference in the memory system and
branch prediction hardware. We find the multiple
threads cause interthread interference in the caches
and place greater demands on the memory system, thus
increasing average memory latencies. By exploiting
thread-level parallelism, however, SMT hides these
additional latencies, so that they only have a small
impact on total program performance. We also find that
for parallel applications, the additional threads have
minimal effects on branch prediction.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
keywords = "measurement; performance",
subject = "{\bf C.1.2} Computer Systems Organization, PROCESSOR
ARCHITECTURES, Multiple Data Stream Architectures
(Multiprocessors), Parallel processors**. {\bf C.0}
Computer Systems Organization, GENERAL, Instruction set
design. {\bf D.4.1} Software, OPERATING SYSTEMS,
Process Management.",
author = "Jack L. Lo and Joel S. Emer and Henry M. Levy and
Rebecca L. Stamm and Dean M. Tullsen",
title = "Converting Thread-Level Parallelism to
Instruction-Level Parallelism via Simultaneous
Multithreading",
journal = j-TOCS,
volume = "15",
number = "3",
pages = "322--354",
month = aug,
year = "1997",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Wed Jan 13 18:36:53 MST 1999",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p322-lo/",
abstract = "To achieve high performance, contemporary computer
systems rely on two forms of parallelism:
instruction-level parallelism (ILP) and thread-level
parallelism (TLP). Wide-issue super-scalar processors
exploit ILP by executing multiple instructions from a
single program in a single cycle. Multiprocessors (MP)
exploit TLP by executing different threads in parallel
on different processors. Unfortunately, both parallel
processing styles statically partition processor
resources, thus preventing them from adapting to
dynamically changing levels of ILP and TLP in a
program. With insufficient TLP, processors in an MP
will be idle; with insufficient ILP, multiple-issue
hardware on a superscalar is wasted. This article
explores parallel processing on an alternative
architecture, simultaneous multithreading (SMT), which
allows multiple threads to compete for and share all
of the processor's resources every cycle. The most
compelling reason for running parallel applications on
an SMT processor is its ability to use thread-level
parallelism and instruction-level parallelism
interchangeably. By permitting multiple threads to
share the processor's functional units simultaneously,
the processor can use both ILP and TLP to accommodate
variations in parallelism. When a program has only a
single thread, all of the SMT processor's resources can
be dedicated to that thread; when more TLP exists, this
parallelism can compensate for a lack of per-thread
ILP. We examine two alternative on-chip parallel
architectures for the next generation of processors. We
compare SMT and small-scale, on-chip multiprocessors in
their ability to exploit both ILP and TLP. First, we
identify the hardware bottlenecks that prevent
multiprocessors from effectively exploiting ILP. Then,
we show that because of its dynamic resource sharing,
SMT avoids these inefficiencies and benefits from being
able to run more threads on a single processor. The use
of TLP is especially advantageous when per-thread ILP
is limited. The ease of adding additional thread
contexts on an SMT (relative to adding additional
processors on an MP) allows simultaneous multithreading
to expose more parallelism, further increasing
functional unit utilization and attaining a 52\%
average speedup (versus a four-processor, single-chip
multiprocessor with comparable execution resources).
This study also addresses an often-cited concern
regarding the use of thread-level parallelism or
multithreading: interference in the memory system and
branch prediction hardware. We find the multiple
threads cause interthread interference in the caches
and place greater demands on the memory system, thus
increasing average memory latencies. By exploiting
thread-level parallelism, however, SMT hides these
additional latencies, so that they only have a small
impact on total program performance. We also find that
for parallel applications, the additional threads have
minimal effects on branch prediction.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
keywords = "measurement; performance",
subject = "{\bf C.1.2} Computer Systems Organization, PROCESSOR
ARCHITECTURES, Multiple Data Stream Architectures
(Multiprocessors), Parallel processors**. {\bf C.0}
Computer Systems Organization, GENERAL, Instruction set
design. {\bf D.4.1} Software, OPERATING SYSTEMS,
Process Management.",
author = "Joseph LoCocero and D. E. (Donald E.) Thomas",
title = "A multithreaded, multiple language hardware\slash
software cosimulator",
type = "Research report",
number = "CMUCAD-97-13",
institution = "Center for Electronic Design Automation, Carnegie
Mellon University",
address = "Pittsburgh, PA, USA",
pages = "7",
month = apr,
year = "1997",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Functional verification of mixed hardware/software
systems is vital to guaranteeing a correct, operational
system. This paper discusses a new multithreaded,
multiple-language cosimulator that directly combines
Verilog and C/C++, the native languages most often used
by hardware and software designers. The interface
between the two languages is specified in detail, as
are some illustrative examples. The performance is
shown to be clearly better than UNIX socket-based
cosimulation approaches. Further, it naturally fits a
cosimulation environment where arbitrary C++ programs
and Verilog descriptions are developed concurrently.",
acknowledgement = ack-nhfb,
annote = "Supported in part by Semiconductor Research
keywords = "C (Computer program language); Embedded computer
systems -- Simulation methods; Verilog (Computer
hardware description language)",
author = "G. Loeffler",
title = "A Multithreaded {Java} Framework for Solving Linear
Elliptic Partial Differential Equations in {3D}",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1343",
pages = "121--??",
year = "1997",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Apr 28 08:51:33 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java.bib;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "L. Lundberg",
title = "Bounding the Minimal Completion Time of Static
Mappings of Multithreaded {Solaris} Programs",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1300",
pages = "1034--??",
year = "1997",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Apr 28 08:51:33 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "R. M. Mateosian",
title = "Micro News: {DARPA} aids {Tera MTA}",
journal = j-IEEE-MICRO,
volume = "17",
number = "5",
pages = "5--6",
month = sep # "\slash " # oct,
year = "1997",
DOI = "https://doi.org/10.1109/MM.1997.621216",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Thu Dec 14 06:08:58 MST 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
Science Citation Index database (1980--2000)",
URL = "http://dlib.computer.org/mi/books/mi1997/pdf/m5005.pdf",
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Martin McCarthy",
title = "Multi-Threading: Intermediate Concepts",
journal = j-LINUX-J,
volume = "36",
pages = "??--??",
month = apr,
year = "1997",
ISSN = "1075-3583 (print), 1938-3827 (electronic)",
ISSN-L = "1075-3583",
bibdate = "Fri Oct 9 08:35:26 MDT 1998",
bibsource = "http://noframes.linuxjournal.com/lj-issues/issue36/index.html;
URL = "ftp://ftp.ssc.com/pub/lj/listings/issue36/2121.tgz",
abstract = "This second part of a series on Multi-threading deals
with how to use C programs with one of the POSIX
packages available for Linux to handle signals and
concurrent threads in global data.",
acknowledgement = ack-nhfb,
fjournal = "Linux Journal",
journal-URL = "http://portal.acm.org/citation.cfm?id=J508",
author = "Martin McCarthy",
title = "What is Multi-Threading?",
journal = j-LINUX-J,
volume = "34",
pages = "??--??",
month = feb,
year = "1997",
ISSN = "1075-3583 (print), 1938-3827 (electronic)",
ISSN-L = "1075-3583",
bibdate = "Fri Oct 9 08:35:26 MDT 1998",
bibsource = "http://noframes.linuxjournal.com/lj-issues/issue34/index.html;
abstract = "A primer on multi-threading: the process whereby Linux
manages several tasks simultaneously.",
acknowledgement = ack-nhfb,
fjournal = "Linux Journal",
journal-URL = "http://portal.acm.org/citation.cfm?id=J508",
author = "Robert McMillan",
title = "News: {Sun} boosts {Java} performance, adding {JIT}
compiler and {JVM} with multithreading to {Solaris
journal = j-JAVAWORLD,
volume = "2",
number = "7",
pages = "??--??",
month = jul,
year = "1997",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Aug 13 14:52:27 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/jw-07-1997/jw-07-speedway.htm",
acknowledgement = ack-nhfb,
author = "E. D. Moreno and S. T. Kofuji and M. H. Cintra",
title = "Prefetching and Multithreading Performance in
Bus-Based Multiprocessors with {Petri} Nets",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1300",
pages = "1017--??",
year = "1997",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Apr 28 08:51:33 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Richard Neves and Robert B. Schnabel",
title = "Threaded Runtime Support for Execution of Fine Grain
Parallel Code on Coarse Grain Multiprocessors",
journal = j-J-PAR-DIST-COMP,
volume = "42",
number = "2",
pages = "128--142",
day = "1",
month = may,
year = "1997",
DOI = "https://doi.org/10.1006/jpdc.1997.1322",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:02 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1322/production;
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Scott Oaks and Henry Wong",
title = "{Java} threads",
publisher = pub-ORA,
address = pub-ORA:adr,
pages = "xiii + 252",
year = "1997",
ISBN = "1-56592-216-6",
ISBN-13 = "978-1-56592-216-7",
LCCN = "QA76.73.J38 O25 1997",
bibdate = "Fri May 10 12:18:17 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "Java series",
acknowledgement = ack-nhfb,
keywords = "java (computer program language); threads (computer
author = "Songpol Ongwattanakul",
title = "A runtime distributed multithreading library for the
{PARC} language",
type = "Thesis ({M.E.E.})",
school = "Department of Electrical Engineering, University of
Alabama",
address = "Tuscaloosa, AL, USA",
pages = "viii + 71",
year = "1997",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Multiprocessors; Parallel processing (Electronic
author = "F. Onion",
title = "Multithreading in {MFC}",
journal = j-C-PLUS-PLUS-REPORT,
volume = "9",
number = "3",
pages = "50--53, 56",
month = mar,
year = "1997",
ISSN = "1040-6042",
bibdate = "Thu Apr 24 09:46:14 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C6110J (Object-oriented programming); C6115
(Programming support); C6150J (Operating systems)",
fjournal = "C++ Report",
keywords = "API calls; application program interfaces; Internet
queries; MFC; multiprogramming; multithreaded
programming; object oriented programming;
object-oriented programming; remote database hits;
software libraries; software tools; threads; user
interface; user interfaces; Windows",
treatment = "P Practical",
author = "Sung-Yong Park and Salim Hariri",
title = "A High Performance Message Passing System for {Network
of Workstations}",
volume = "11",
number = "2",
pages = "159--180",
month = oct,
year = "1997",
DOI = "https://doi.org/10.1023/A:1007912007767",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Wed Jul 6 12:13:07 MDT 2005",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=11&issue=2;
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=11&issue=2&spage=159;
acknowledgement = ack-nhfb,
classification = "C5620W (Other computer networks); C6150N
(Distributed systems software)",
corpsource = "Dept. of Electr. and Comput. Eng., Syracuse Univ., NY,
USA",
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
keywords = "application programming interface; asynchronous
transfer mode; ATM; ATM network; device driver;
distributed computing; high performance; message
passing; message-passing system; multithreaded
message-passing system; NCS; network of workstations;
NOW environment; NYNET; wide area network; wide area
pubcountry = "Netherlands",
treatment = "P Practical",
author = "Shashi Prasad",
title = "Multithreading programming techniques",
publisher = pub-MCGRAW-HILL,
address = pub-MCGRAW-HILL:adr,
pages = "xix + 410",
year = "1997",
ISBN = "0-07-912250-7, 0-07-050710-4 (Computer disk)",
ISBN-13 = "978-0-07-912250-6, 978-0-07-050710-4 (Computer disk)",
LCCN = "QA76.76.D47 P72 1997",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "The J. Ranade workstation series",
acknowledgement = ack-nhfb,
annote = "System requirements: C programming language.",
keywords = "Application software -- Development; C (Computer
program language); Cross-platform software
author = "Suresh B. Ravoor and Johnny S. K. Wong",
title = "Multithreaded Transaction Processing in Distributed
Systems",
journal = j-J-SYST-SOFTW,
volume = "38",
number = "2",
pages = "107--117",
month = aug,
year = "1997",
ISSN = "0164-1212 (print), 1873-1228 (electronic)",
ISSN-L = "0164-1212",
bibdate = "Wed Dec 16 08:24:49 MST 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "The Journal of systems and software",
journal-URL = "http://www.sciencedirect.com/science/journal/01641212",
author = "Stefan Savage and Michael Burrows and Greg Nelson and
Patrick Sobalvarro and Thomas Anderson",
title = "{Eraser}: a Dynamic Data Race Detector for
Multithreaded Programs",
journal = j-TOCS,
volume = "15",
number = "4",
pages = "391--411",
month = nov,
year = "1997",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Wed Jan 13 18:36:53 MST 1999",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
note = "Co-published in {\em Operating Systems Review}, {\bf
URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-4/p391-savage/",
abstract = "Multithreaded programming is difficult and error
prone. It is easy to make a mistake in synchronization
that produces a data race, yet it can be extremely hard
to locate this mistake during debugging. This article
describes a new tool, called Eraser, for dynamically
detecting data races in lock-based multithreaded
programs. Eraser uses binary rewriting techniques to
monitor every shared-memory reference and verify that
consistent locking behavior is observed. We present
several case studies, including undergraduate
coursework and a multithreaded Web search engine, that
demonstrate the effectiveness of this approach.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
keywords = "algorithms; experimentation; reliability",
subject = "{\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing
and Debugging, Monitors. {\bf D.1.3} Software,
PROGRAMMING TECHNIQUES, Concurrent Programming,
Parallel programming. {\bf D.2.5} Software, SOFTWARE
ENGINEERING, Testing and Debugging, Debugging aids.
{\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing and
Debugging, Tracing. {\bf D.4.1} Software, OPERATING
SYSTEMS, Process Management, Concurrency. {\bf D.4.1}
Software, OPERATING SYSTEMS, Process Management,
Deadlocks. {\bf D.4.1} Software, OPERATING SYSTEMS,
Process Management,
Multiprocessing/multiprogramming/multitasking. {\bf
D.4.1} Software, OPERATING SYSTEMS, Process Management,
Mutual exclusion.",
author = "George Shepherd and Scot Wingo",
title = "Undocumented Corner: {ATL} and the {IUnknown}
Interface",
journal = j-DDJ,
volume = "22",
number = "8",
pages = "119--123",
month = aug,
year = "1997",
ISSN = "1044-789X",
bibdate = "Mon Aug 11 11:38:10 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "George and Scot continue their examination of
Microsoft's Active Template Library, this month looking
at the heart of ATL, including its support for
multithreading and its various implementations of
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Michael Shoffner",
title = "{Java} Step by Step: Write your own threaded
discussion forum",
journal = j-JAVAWORLD,
volume = "2",
number = "2",
pages = "??--??",
month = feb,
year = "1997",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Aug 13 14:52:24 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/jw-02-1997/jw-02-step.htm",
acknowledgement = ack-nhfb,
author = "Michael Shoffner",
title = "{Java} Step By Step: Write your own threaded
discussion forum: The communications and server
journal = j-JAVAWORLD,
volume = "2",
number = "3",
pages = "??--??",
month = mar,
year = "1997",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Aug 13 14:52:25 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/jw-03-1997/jw-03-step.htm",
acknowledgement = ack-nhfb,
author = "J. Sime",
title = "Guarded pointers: moving smart pointers into
multithreaded systems",
journal = j-C-PLUS-PLUS-REPORT,
volume = "9",
number = "4",
pages = "32--41",
month = apr,
year = "1997",
ISSN = "1040-6042",
bibdate = "Thu Apr 24 09:46:14 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C6110J (Object-oriented programming); C6120 (File
organisation); C6130 (Data handling techniques); C6150N
(Distributed systems software)",
fjournal = "C++ Report",
keywords = "abstract data types; C listings; concurrency control;
concurrency control pattern; data integrity; exception
handling; guarded pointers; multiprogramming;
multithreaded systems; object-oriented programming;
protected data resource; protection proxy pattern;
reference count lock; safety; smart pointers; thread
safety mechanisms",
treatment = "P Practical",
author = "Balaram Sinharoy",
title = "Optimized Thread Creation for Processor
Multithreading",
journal = j-COMP-J,
volume = "40",
number = "6",
pages = "388--??",
month = "????",
year = "1997",
ISSN = "0010-4620 (print), 1460-2067 (electronic)",
ISSN-L = "0010-4620",
bibdate = "Wed Jul 21 09:55:15 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/compj1990.bib;
URL = "http://www.oup.co.uk/computer_journal/Volume_40/Issue_06/Vol40_06.body.html#AbstractSinharoy;
acknowledgement = ack-nhfb,
email-1 = "balaram@watson.ibm.com",
fjournal = "The Computer Journal",
journal-URL = "http://comjnl.oxfordjournals.org/",
author = "Angela Sodan and Guang R. Gao and Olivier Maquelin and
Jens-Uwe Schultz and Xin-Min Tian",
title = "Experiences with Non-numeric Applications on
Multithreaded Architectures",
journal = j-SIGPLAN,
volume = "32",
number = "7",
pages = "124--135",
month = jul,
year = "1997",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:17:35 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Andrew Sohn and Mitsuhisa Sato and Namhoon Yoo and
Jean-Luc Gaudiot",
title = "Data and Workload Distribution in a Multithreaded
Architecture",
journal = j-J-PAR-DIST-COMP,
volume = "40",
number = "2",
pages = "256--264",
day = "1",
month = feb,
year = "1997",
DOI = "https://doi.org/10.1006/jpdc.1996.1262",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:02 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1262/production;
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "David B. Stewart and Pradeep K. Khosla",
title = "Mechanisms for Detecting and Handling Timing Errors",
journal = j-CACM,
volume = "40",
number = "1",
pages = "87--93",
month = jan,
year = "1997",
ISSN = "0001-0782 (print), 1557-7317 (electronic)",
ISSN-L = "0001-0782",
bibdate = "Fri Oct 10 18:17:54 MDT 1997",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org/pubs/citations/journals/cacm/1997-40-1/p87-stewart/",
acknowledgement = ack-nhfb,
classification = "C6110B (Software engineering techniques); C6130
(Data handling techniques); C6150J (Operating
systems)",
corpsource = "Inst. for Adv. Comput. Studies, Maryland Univ.,
College Park, MD, USA",
fjournal = "Communications of the ACM",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J79",
keywords = "adaptive real-time scheduling; aperiodic servers;
Chimera; design; error handling; imprecise computation;
low-overhead policy-independent system; management;
operating systems (computers); performance; periodic
threads; real-time operating system; real-time
systems; real-time systems analysis; real-time threads;
reliability; scheduling; scheduling policies; software
fault tolerance; specifications; system failure;
theory; timing; timing error detection; worst-case
execution times",
subject = "{\bf K.6.3} Computing Milieux, MANAGEMENT OF COMPUTING
AND INFORMATION SYSTEMS, Software Management, Software
development. {\bf C.3} Computer Systems Organization,
Real-time systems. {\bf C.4} Computer Systems
treatment = "P Practical",
author = "Kenjiro Taura and Akinori Yonezawa",
title = "Fine-grain Multithreading with Minimal Compiler
Support --- a Cost Effective Approach to Implementing
Efficient Multithreading Languages",
journal = j-SIGPLAN,
volume = "32",
number = "5",
pages = "320--333",
month = may,
year = "1997",
ISBN = "0-89791-907-6",
ISBN-13 = "978-0-89791-907-4",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Thu May 13 12:37:28 MDT 1999",
bibsource = "http://www.acm.org/pubs/contents/proceedings/pldi/258915/index.html;
URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/258915/p320-taura/",
acknowledgement = ack-nhfb,
annote = "Published as part of the Proceedings of PLDI'97.",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "design; languages; measurement; performance;
standardization; theory",
subject = "{\bf D.3.4} Software, PROGRAMMING LANGUAGES,
Processors, Compilers. {\bf D.3.3} Software,
PROGRAMMING LANGUAGES, Language Constructs and
Features, Data types and structures. {\bf D.3.2}
Software, PROGRAMMING LANGUAGES, Language
Classifications. {\bf D.3.4} Software, PROGRAMMING
LANGUAGES, Processors, Code generation. {\bf C.2.2}
Computer Systems Organization, COMPUTER-COMMUNICATION
NETWORKS, Network Protocols.",
author = "Duncan Walter {Temple Lang}",
title = "A multi-threaded extension to a high level interactive
statistical computing environment",
type = "Thesis ({Ph.D. in Statistics})",
school = "Dept. of Statistics, University of California,
Berkeley",
address = "Berkeley, CA, USA",
pages = "vii + 161",
month = dec,
year = "1997",
bibdate = "Sat Apr 20 11:15:46 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "P. Thompson and G. Bumgardner",
title = "{Threads.h++}: a portable {C++} library for
multithreaded programming",
journal = j-C-PLUS-PLUS-REPORT,
volume = "9",
number = "3",
pages = "24--37",
month = mar,
year = "1997",
ISSN = "1040-6042",
bibdate = "Thu Apr 24 09:46:14 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C6110B (Software engineering techniques); C6110J
(Object-oriented programming); C6115 (Programming
support); C6150J (Operating systems)",
fjournal = "C++ Report",
keywords = "application development; application program
interfaces; C language; low-level procedural API;
multiprocessor machines; multiprogramming;
multithreaded programming; object-oriented
abstractions; object-oriented languages;
object-oriented programming; operating systems;
portable C++ library; responsive performance; software
libraries; software portability; synchronisation;
synchronization; thread control; thread creation;
Threads.h++; Web browsers",
treatment = "P Practical",
author = "P. Thompson and G. Bumgardner",
title = "{Threads.h++}: a portable {C++} library for
multithreaded programming",
journal = j-C-PLUS-PLUS-REPORT,
volume = "9",
number = "3",
pages = "24--37",
month = mar,
year = "1997",
ISSN = "1040-6042",
bibdate = "Thu Apr 24 09:46:14 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
classification = "C6110B (Software engineering techniques); C6110J
(Object-oriented programming); C6115 (Programming
support); C6150J (Operating systems)",
fjournal = "C++ Report",
keywords = "application development; application program
interfaces; C language; low-level procedural API;
multiprocessor machines; multiprogramming;
multithreaded programming; object-oriented
abstractions; object-oriented languages;
object-oriented programming; operating systems;
portable C++ library; responsive performance; software
libraries; software portability; synchronisation;
synchronization; thread control; thread creation;
Threads.h++; Web browsers",
treatment = "P Practical",
author = "Jenn-Yuan Tsai",
title = "Performance study of a concurrent multithreaded
processor",
type = "Technical report",
number = "TR 97-034",
institution = "University of Minnesota, Dept. of Computer Science and
Engineering",
address = "Minneapolis, MN, USA",
pages = "24",
year = "1997",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The performance of a concurrent multithreaded
architectural model, called superthreading [15], is
studied in this paper. It tries to integrate optimizing
compilation techniques and run-time hardware support to
exploit both thread-level and instruction-level
parallelism, as opposed to exploit only
instruction-level parallelism in existing superscalars.
The superthreaded architecture uses a thread pipelining
execution model to enhance the overlapping between
threads, and to facilitate data dependence enforcement
between threads through compiler-directed,
hardware-supported, thread-level control speculation
and run-time data dependence checking. We also evaluate
the performance of the superthreaded processor through
a detailed trace-driven simulator. Our results show
that the superthreaded execution model can obtain good
performance by exploiting both thread-level and
instruction-level parallelism in programs. We also
study the design parameters of its main system
components, such as the size of the memory buffer, the
bandwidth requirement of the communication links
between thread processing units, and the bandwidth
requirement of the shared data cache.",
acknowledgement = ack-nhfb,
annote = "Supported in part by the National Science Foundation.
Supported in part by the U.S. Army Intelligence Center
and Fort Huachuca. Supported in part by a gift from
Intel Corporation",
keywords = "Compilers (Computer programs); Computer architecture;
Parallel processing (Electronic computers); Threads
(Computer programs)",
author = "Jenn-Yuan Tsai",
title = "Superthreading: integrating compilation technology and
processor architecture for cost-effective concurrent
multithreading",
type = "Technical report",
number = "TR 97-033",
institution = "University of Minnesota, Dept. of Computer Science and
Engineering",
address = "Minneapolis, MN, USA",
pages = "16",
day = "29",
month = jan,
year = "1997",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "As the number of transistors that can be integrated on
a single chip continues to grow, it is important for
computer architects to think beyond the traditional
approaches of deeper pipelines and wider instruction
issue units for improving performance. This
single-threaded execution model limits these approaches
to exploiting only the relatively small amount of
instruction-level parallelism available in application
programs. While integrating an entire multiprocessor
onto a single chip is feasible, this architecture is
limited to exploiting only relatively coarse-grained
heavy-weight parallelism. We propose the superthreaded
architecture as an excellent alternative for utilizing
the large number of transistors that will become
available on a single high-density chip. As a hybrid of
a wide-issue superscalar processor and a
multiprocessor-on-a-chip, this new concurrent
multithreading architecture can leverage the best of
existing and future parallel hardware and software
technologies. By incorporating speculation for control
dependences and run-time checking of data dependences,
the superthreaded architecture can exploit the multiple
granularities of parallelism available in
general-purpose application programs to reduce the
execution time of a single program.",
acknowledgement = ack-nhfb,
annote = "Supported in part by the U.S. Army Intelligence Center
and Fort Huachuca. Supported in part by the National
Science Foundation. Supported in part by a gift from
the Intel Corporation",
keywords = "Compilers (Computer programs); Computer architecture;
Parallel processing (Electronic computers); Threads
(Computer programs)",
author = "Laurence Vanhelsuw{\'e}",
title = "Book Review: The {Java} {Threads} {API} makes it to
print media",
journal = j-JAVAWORLD,
volume = "2",
number = "7",
pages = "??--??",
month = jul,
year = "1997",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Aug 13 14:52:27 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/jw-07-1997/jw-07-threads.htm",
acknowledgement = ack-nhfb,
author = "Laurence Vanhelsuw{\'e}",
title = "{JavaBeans}: properties, events, and thread safety",
journal = j-JAVAWORLD,
volume = "2",
number = "9",
pages = "??--??",
month = sep,
year = "1997",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Aug 13 14:52:28 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/jw-09-1997/jw-09-raceconditions.htm",
acknowledgement = ack-nhfb,
author = "Bill Venners",
title = "Under the Hood: How the {Java} virtual machine
performs thread synchronization",
journal = j-JAVAWORLD,
volume = "2",
number = "7",
pages = "??--??",
month = jul,
year = "1997",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Aug 13 14:52:27 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/jw-07-1997/jw-07-hood.htm",
acknowledgement = ack-nhfb,
author = "Alain Vermeulen",
title = "{Java} Deadlock: The woes of multithreaded design",
journal = j-DDJ,
volume = "22",
number = "9",
pages = "52, 54--56, 88, 89",
month = sep,
year = "1997",
ISSN = "1044-789X",
bibdate = "Mon Aug 11 12:53:44 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Russell Weisz",
title = "More First Aid for the Thread Impaired: Cool Ways to
Take Advantage of Multithreading",
journal = j-MICROSOFT-SYS-J,
volume = "12",
number = "7",
pages = "33--??",
month = jul,
year = "1997",
ISSN = "0889-9932",
bibdate = "Sat Nov 7 10:33:30 MST 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Microsoft Systems Journal",
author = "Steve Whittaker and Jerry Swanson and Jakov Kucan and
Candy Sidner",
title = "{TeleNotes}: managing lightweight interactions in the
desktop",
journal = j-TOCHI,
volume = "4",
number = "2",
pages = "137--168",
month = jun,
year = "1997",
ISSN = "1073-0516 (print), 1557-7325 (electronic)",
ISSN-L = "1073-0516",
bibdate = "Tue Jan 19 05:49:17 MST 1999",
bibsource = "http://www.acm.org/pubs/contents/journals/tochi/;
URL = "http://www.acm.org:80/pubs/citations/journals/tochi/1997-4-2/p137-whittaker/",
abstract = "Communication theories and technology have tended to
focus on extended, formal meetings and have neglected a
prevalent and vital form of workplace communication ---
namely, lightweight communication. Unlike formal,
extended meetings, lightweight interaction is brief,
informal, unplanned, and intermittent. We analyze
naturalistic data from a study of work-place
communication and derive five design criteria for
lightweight interaction systems. These criteria require
that systems for lightweight interaction support {\em
conversational tracking, rapid connection}, the ability
to {\em leave a message}, {\em context management}, and
{\em shared real-time objects}. Using these criteria,
we evaluate existing interpersonal communications
technologies. We then describe an implementation of a
system (TeleNotes) that is designed to support
lightweight interaction by meeting these criteria. The
interface metaphor allows communications to be based
around desktop objects, resembling ``sticky notes.''
These objects are also organized into ``desktop piles''
to support conversational threads and provide
mechanisms for initiating real-time audio, video, and
application sharing. We conducted informal user testing
of several system prototypes. Based on our findings,
outstanding issues concerning theory and systems design
for communication systems are outlined --- in
particular, with regard to the issue of managing
conversations over time.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer-Human Interaction",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J756",
keywords = "human factors",
subject = "{\bf H.5.3} Information Systems, INFORMATION
Interfaces, Evaluation/methodology. {\bf H.1.2}
Information Systems, MODELS AND PRINCIPLES,
User/Machine Systems, Human factors. {\bf H.5.3}
PRESENTATION, Group and Organization Interfaces,
Asynchronous interaction. {\bf I.3.6} Computing
Methodologies, COMPUTER GRAPHICS, Methodology and
Techniques, Interaction techniques. {\bf H.5.3}
PRESENTATION, Group and Organization Interfaces,
Synchronous interaction. {\bf H.5.1} Information
Multimedia Information Systems,
author = "Greg Wilson",
title = "Bookshelf: Threads Primer: a Guide To Multithreaded
Programming",
journal = j-IEEE-SOFTWARE,
volume = "14",
number = "5",
pages = "116--116",
month = sep # "\slash " # oct,
year = "1997",
ISSN = "0740-7459 (print), 1937-4194 (electronic)",
ISSN-L = "0740-7459",
bibdate = "Mon Sep 15 22:35:10 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib;
URL = "http://dlib.computer.org/so/books/so1997/pdf/s5115.pdf",
acknowledgement = ack-nhfb,
fjournal = "IEEE Software",
journal-URL = "http://www.computer.org/portal/web/csdl/magazines/software",
author = "Chia Wei Yang",
title = "A multi-context uniprocessor: another multithreaded
type = "Thesis ({M.S.})",
school = "California Polytechnic State University",
address = "San Luis Obispo, CA, USA",
pages = "viii + 129",
year = "1997",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
annote = "Proposes a computer architecture model that adapts all
advantages from multithreaded models to a uniprocessor
keywords = "Computer architecture; Multiprocessors; Parallel
processing (Electronic Computers)",
author = "Jean-Marc Adamo",
title = "Multi-threaded object-oriented {MPI}-based message
passing interface: the {ARCH} library",
volume = "SECS 446",
publisher = pub-KLUWER,
address = pub-KLUWER:adr,
pages = "xiv + 185",
year = "1998",
ISBN = "0-7923-8165-3",
ISBN-13 = "978-0-7923-8165-5",
LCCN = "TK5102.5.A293 1998",
bibdate = "Fri Aug 7 08:29:38 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
price = "US\$120.00",
series = "The Kluwer international series in engineering and
computer science",
acknowledgement = ack-nhfb,
keywords = "data transmission systems; object-oriented programming
(computer science); threads (computer programs)",
libnote = "Not yet in my library.",
author = "R. M. Aiex and S. L. Martins and C. C. Ribeiro and N.
D. L. R. Rodriguez",
title = "Cooperative Multi-thread Parallel Tabu Search with an
Application to Circuit Partitioning",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1457",
pages = "310--??",
year = "1998",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Oct 10 14:40:24 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Paul Amaranth",
title = "A {Tcl}-based Multithreaded Test Harness",
crossref = "USENIX:1998:PSA",
pages = "??--??",
year = "1998",
bibdate = "Fri Oct 18 07:49:55 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://db.usenix.org/publications/library/proceedings/tcl98/amaranth.html",
acknowledgement = ack-nhfb,
author = "Anonymous",
title = "Multithreaded System",
journal = j-IEEE-MICRO,
volume = "18",
number = "3",
pages = "76--76",
month = may # "\slash " # jun,
year = "1998",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Thu Dec 14 06:08:58 MST 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
Science Citation Index database (1980--2000)",
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Anonymous",
title = "New Tools: Software Development: {Uniscape}'s
Internationalization Library; {Global Technologies}'
{Unix-to-NT} Solution; {KAI}'s Multithreaded {Java}
Debugging Tool; {Price Systems}' Parametric Forecasting
journal = j-COMPUTER,
volume = "31",
number = "6",
pages = "98, 102",
month = jun,
year = "1998",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Thu Jun 4 08:22:02 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/computer1990.bib;
URL = "http://dlib.computer.org/co/books/co1998/pdf/r6098.pdf",
acknowledgement = ack-nhfb,
fjournal = "Computer",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
author = "Steve Ball and John Miller Crawford",
title = "Multi-Threaded Assignment Surprises",
journal = j-JAVA-REPORT,
volume = "3",
number = "??",
pages = "??--??",
month = sep,
year = "1998",
ISSN = "1086-4660",
bibdate = "Sat Dec 26 13:52:53 1998",
bibsource = "http://archive.javareport.com/9809/html/from_pages/index.shtml;
URL = "http://archive.javareport.com/9809/html/from_pages/ftp_col1.shtml",
abstract = "A volatile brew is formed by mixing assignment and
threads. Perils and surprises lurk within the most
innocent-looking statement. We expose those perils and
surprises and point out where you need to proceed with
due caution to ensure the effective use of locked
acknowledgement = ack-nhfb,
author = "Gaurav Banga and Peter Druschel and Jeffrey C. Mogul",
title = "Better operating system features for faster network
servers",
journal = j-SIGMETRICS,
volume = "26",
number = "3",
pages = "23--30",
month = dec,
year = "1998",
CODEN = "????",
DOI = "https://doi.org/10.1145/306225.306234",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Thu Jun 26 11:27:29 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Widely-used operating systems provide inadequate
support for large-scale Internet server applications.
Their algorithms and interfaces fail to efficiently
support either event-driven or multi-threaded servers.
They provide poor control over the scheduling and
management of machine resources, making it difficult to
provide robust and controlled service. We propose new
UNIX interfaces to improve scalability, and to provide
fine-grained scheduling and resource management.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
author = "Nelson H. F. Beebe",
title = "A Bibliography of Publications about Multithreading",
institution = inst-CSC,
address = inst-CSC:adr,
pages = "15",
day = "7",
month = aug,
year = "1998",
bibdate = "Sat Apr 11 10:26:14 1998",
bibsource = "https://www.math.utah.edu/pub/bibnet/authors/b/beebe-nelson-h-f.bib;
note = "This report is updated frequently.",
URL = "https://www.math.utah.edu/pub/tex/bib/index-table-m.html#multithreading",
author = "Edoardo Biagioni and Ken Cline and Peter Lee and Chris
Okasaki and Chris Stone",
title = "Safe-for-Space Threads in {Standard ML}",
volume = "11",
number = "2",
pages = "209--225",
month = dec,
year = "1998",
DOI = "https://doi.org/10.1023/A:1010016600604",
ISSN = "1388-3690 (print), 2212-0793 (electronic)",
ISSN-L = "1388-3690",
bibdate = "Wed Jul 6 15:50:28 MDT 2005",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=1388-3690&volume=11&issue=2;
OCLC Contents1st database",
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=1388-3690&volume=11&issue=2&spage=209;
acknowledgement = ack-nhfb,
fjournal = "Higher-Order and Symbolic Computation",
author = "Lubomir Bic and Michael B. Dillencourt and Munehiro
Fukuda",
title = "Mobile agents, {DSM}, coordination, and self-migrating
threads: a common framework",
type = "UCI-ICS technical report",
number = "98-33",
institution = "Information and Computer Science, University of
California, Irvine",
address = "Irvine, CA",
pages = "11",
day = "8",
month = oct,
year = "1998",
LCCN = "Z699 .C3 no.98-33",
bibdate = "Fri May 10 12:18:17 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "distributed shared memory; intelligent agents
(computer software)",
author = "Robert D. Blumofe and Charles E. Leiserson",
title = "Space-Efficient Scheduling of Multithreaded
journal = j-SIAM-J-COMPUT,
volume = "27",
number = "1",
pages = "202--229",
month = feb,
year = "1998",
ISSN = "0097-5397 (print), 1095-7111 (electronic)",
ISSN-L = "0097-5397",
bibdate = "Sat Dec 5 17:26:53 MST 1998",
bibsource = "http://epubs.siam.org/sam-bin/dbq/toclist/SICOMP/27/1;
URL = "http://epubs.siam.org/sam-bin/dbq/article/25947",
acknowledgement = ack-nhfb,
fjournal = "SIAM Journal on Computing",
journal-URL = "http://epubs.siam.org/sicomp",
author = "Sharon M. Brunett and John Thornley and Marrq
title = "An Initial Evaluation of the {Tera} Multithreaded
Architecture and Programming System Using the {C3I}
Parallel Benchmark Suite",
crossref = "ACM:1998:SHP",
pages = "??--??",
year = "1998",
bibdate = "Wed Mar 06 06:27:47 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.supercomp.org/sc98/TechPapers/sc98_FullAbstracts/Brunett1063/Index.htm",
acknowledgement = ack-nhfb,
author = "Denis Caromel and Julien Vayssiere",
title = "A {Java} Framework for Seamless Sequential,
Multi-threaded, and Distributed Programming",
crossref = "ACM:1998:AWJ",
pages = "??--??",
year = "1998",
bibdate = "Thu Apr 27 10:43:08 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.cs.ucsb.edu/conferences/java98/papers/javapp.pdf;
acknowledgement = ack-nhfb,
author = "B. Chapman and P. Mehrotra",
title = "{OpenMP} and {HPF}: Integrating Two Paradigms",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1470",
pages = "650--??",
year = "1998",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Oct 10 14:40:24 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Jiajun Chen and Xiaodong Yuan and Guolian Zhengp",
title = "A multi-threaded object-oriented programming model",
journal = j-SIGSOFT,
volume = "23",
number = "3",
pages = "83--86",
month = may,
year = "1998",
DOI = "https://doi.org/10.1145/279437.279477",
ISSN = "0163-5948 (print), 1943-5843 (electronic)",
ISSN-L = "0163-5948",
bibdate = "Wed Aug 1 17:13:36 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "This paper presents a concurrent object-oriented
programming (COOP) model established around concurrent
objects which may have a body. Once an object with a
body is created, its body begins to run as a separate
execution thread of the object. Distinguished from some
active-object-based concurrent object-oriented models,
the object body in our model is not used for the
concurrency control of objects, but only as a mechanism
to introduce concurrent executions into OO model.
Concurrency control is specified by the attributes of
objects and the control codes are generated by a
compiling system based on these attributes. In
addition, objects should be designed in such a way that
they can be used in both sequential and concurrent
environments, no matter whether they have a body or
not. In our model, several execution threads may
coexist in an object and some synchronization
mechanisms are provided to control the concurrent
executions of these threads. The paper presents two
examples of concurrent programming with our model.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGSOFT Software Engineering Notes",
journal-URL = "https://dl.acm.org/citation.cfm?id=J728",
author = "Aaron Cohen and Mike Woodring",
title = "{Win32} Multithreaded Programming",
publisher = pub-ORA,
address = pub-ORA:adr,
pages = "xv + 705",
year = "1998",
ISBN = "1-56592-296-4",
ISBN-13 = "978-1-56592-296-9",
LCCN = "QA76.76.O63 C633 1998",
bibdate = "Fri Aug 7 08:29:38 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
price = "US\$39.95",
URL = "http://www.ora.com/catalog/multithread/;
acknowledgement = ack-nhfb,
keywords = "Microsoft Win32; Microsoft Windows (Computer file);
Operating systems (Computers)",
author = "Mike Criscolo",
title = "{Java Q\&A}: How Do {I} Queue {Java} Threads?",
journal = j-DDJ,
volume = "23",
number = "10",
pages = "127--129",
month = oct,
year = "1998",
ISSN = "1044-789X",
bibdate = "Fri Sep 11 09:12:05 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.ddj.com/ftp/1998/1998_10/jqa108.txt;
abstract = "In examining queuing techniques in Java, Mike presents
one approach to multithreading he has implemented, and
examines the differences between centralized- and
distributed-queuing models. Additional resources
include jqa108.txt (listings) and jqa108.zip (source
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Mike Criscolo",
title = "{Java Q and A}: How Do {I} Queue {Java} Threads?",
journal = j-DDJ,
volume = "23",
number = "10",
pages = "127--129",
month = oct,
year = "1998",
ISSN = "1044-789X",
bibdate = "Fri Sep 11 09:12:05 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.ddj.com/ftp/1998/1998_10/jqa108.txt;
abstract = "In examining queuing techniques in Java, Mike presents
one approach to multithreading he has implemented, and
examines the differences between centralized- and
distributed-queuing models. Additional resources
include jqa108.txt (listings) and jqa108.zip (source
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Jeff Cromwell",
title = "Programmer's Bookshelf: The Dawning of the Age of
journal = j-DDJ,
volume = "23",
number = "9",
pages = "127, 129",
month = sep,
year = "1998",
ISSN = "1044-789X",
bibdate = "Wed Aug 05 10:12:23 1998",
bibsource = "http://www.ddj.com/ddj/1998/1998_09/index.htm;
URL = "",
abstract = "Jeff's focus this month is multithreading, as he
examines {\em Multithreading Programming Techniques in
Win32}, by Jim Beveridge and R. Wiener, {\em
Object-Oriented Multithreading Using C++}, by Cameron
and Tracy Hughes, and {\em Multithreading Programming
Techniques}, by Shashi Prasad.",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Leonardo Dagum and Ramesh Menon",
title = "{OpenMP}: An Industry-Standard {API} for Shared-Memory
journal = j-IEEE-COMPUT-SCI-ENG,
volume = "5",
number = "1",
pages = "46--55",
month = jan # "\slash " # mar,
year = "1998",
DOI = "https://doi.org/10.1109/99.660313",
ISSN = "1070-9924 (print), 1558-190X (electronic)",
ISSN-L = "1070-9924",
bibdate = "Sat Jan 9 08:57:23 MST 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://dlib.computer.org/cs/books/cs1998/pdf/c1046.pdf;
acknowledgement = ack-nhfb,
fjournal = "IEEE Computational Science \& Engineering",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=99",
author = "Joe {DeRusso, III} and Peter Haggar",
title = "Multithreaded Exception Handling in {Java}",
journal = j-JAVA-REPORT,
volume = "3",
number = "??",
pages = "??--??",
month = aug,
year = "1998",
ISSN = "1086-4660",
bibdate = "Sat Dec 26 13:52:53 1998",
bibsource = "http://archive.javareport.com/9808/html/from_pages/index.shtml;
URL = "http://archive.javareport.com/9808/html/from_pages/ftp_feature.shtml",
abstract = "Introducing new classes and interfaces to be used when
writing multithreaded Java programs. These classes are
small, easy to use, and effectively enable you to
handle exceptions occurring on secondary threads.",
acknowledgement = ack-nhfb,
author = "Dave Dyer",
title = "Can {Assure} save {Java} from the perils of
journal = j-JAVAWORLD,
volume = "3",
number = "10",
pages = "??--??",
year = "1998",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Mon Jan 4 06:11:43 MST 1999",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/jw-10-1998/jw-10-assure.htm",
acknowledgement = ack-nhfb,
author = "Jesper Eskilson and Mats Carlsson",
title = "{SICStus MT} --- a Multithreaded Execution
Environment for {SICStus Prolog}",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1490",
pages = "36--53",
year = "1998",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Feb 5 11:53:01 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1490.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1490/14900036.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Matteo Frigo and Charles E. Leiserson and Keith H.
title = "The Implementation of the {Cilk-5} Multithreaded
journal = j-SIGPLAN,
volume = "33",
number = "5",
pages = "212--223",
month = may,
year = "1998",
ISBN = "0-89791-987-4",
ISBN-13 = "978-0-89791-987-6",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:17:47 MST 2003",
bibsource = "http://www.acm.org/pubs/contents/proceedings/pldi/277650/index.html;
URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/277650/p212-frigo/",
acknowledgement = ack-nhfb,
annote = "Published as part of the Proceedings of PLDI'98.",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "algorithms; languages; performance",
subject = "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language
Classifications, Concurrent, distributed, and parallel
languages. {\bf D.1.3} Software, PROGRAMMING
TECHNIQUES, Concurrent Programming, Parallel
programming. {\bf D.3.3} Software, PROGRAMMING
LANGUAGES, Language Constructs and Features, Control
structures. {\bf D.3.2} Software, PROGRAMMING
LANGUAGES, Language Classifications, C.",
author = "David Geary",
title = "{Swing} and multithreading",
journal = j-JAVA-REPORT,
volume = "3",
number = "??",
pages = "??--??",
month = nov,
year = "1998",
ISSN = "1086-4660",
bibdate = "Sat Dec 26 13:52:53 1998",
bibsource = "http://archive.javareport.com/9811/html/from_pages/index.shtml;
URL = "http://archive.javareport.com/9811/html/from_pages/ftp_col1.shtml",
abstract = "Read about why Swing is not thread-safe and the
ramifications of a single-threaded design for
developers using Swing.",
acknowledgement = ack-nhfb,
author = "Milind Girkar and Mohammad R. Haghighat and Paul Grey
and Hideki Saito and Nicholas Stavrakos and Constantine
D. Polychronopoulos",
title = "{Illinois-Intel} Multithreading Library:
Multithreading Support for {Intel} Architecture Based
Multiprocessor Systems",
journal = j-INTEL-TECH-J,
number = "Q1",
pages = "15",
year = "1998",
ISSN = "1535-766X",
bibdate = "Fri Jun 01 06:02:08 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://developer.intel.com/technology/itj/q11998/articles/art_5.htm;
acknowledgement = ack-nhfb,
author = "Prasad N. Golla and Eric C. Lin",
title = "A comparison of the effect of branch prediction on
multithreaded and scalar architectures",
journal = j-COMP-ARCH-NEWS,
volume = "26",
number = "4",
pages = "3--11",
month = sep,
year = "1998",
DOI = "https://doi.org/10.1145/1216475.1216476",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Jun 17 12:06:40 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Speculative instructions execution requires dynamic
branch predictors to increase the performance of a
processor by executing from predicted branch target
routines. Conventional Scalar architectures such as the
Superscalar or Multiscalar architecture executes from a
single stream, while a Multithreaded architecture
executes from multiple streams at a time. Several
aggressive branch predictors have been proposed with
high prediction accuracies. Unfortunately, none of the
branch predictors can provide 100\% accuracy.
Therefore, there is an inherent limitation on
speculative execution in real implementation. In this
paper, we show that Multithreaded architecture is a
better candidate for utilizing speculative execution
than Scalar architectures. Generally the branch
prediction performance degradation is compounded for
larger window sizes on Scalar architectures, while for
a Multithreaded architecture, by increasing the number
of executing threads, we could sustain a higher
performance for a large aggregated speculative window
size. Hence, heavier workloads may increase performance
and utilization for Multithreaded architectures. We
present analytical and simulation results to support
our argument.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Prasad N. Golla and Eric C. Lin",
title = "Cache memory requirements for multithreaded
uniprocessor architecture",
type = "Technical paper",
number = "98-CSE-03",
institution = "Dept. of Computer Science and Engineering, Southern
Methodist University",
address = "Dallas, TX, USA",
pages = "32",
year = "1998",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "J. C. Gomez and E. Mascarenhas and V. Rego",
title = "The {CLAM} Approach to Multithreaded Communication on
Shared Memory Multiprocessors: Design and Experiments",
volume = "9",
number = "1",
pages = "36--49",
month = jan,
year = "1998",
DOI = "https://doi.org/10.1109/71.655241",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Fri Nov 6 12:31:15 MST 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://dlib.computer.org/td/books/td1998/pdf/l0036.pdf;
acknowledgement = ack-nhfb,
classification = "B6150M (Protocols); B6210L (Computer
communications); C5440 (Multiprocessing systems); C5640
(Protocols); C5670 (Network performance)",
corpsource = "Dept. of Comput. Sci., Purdue Univ., West Lafayette,
fjournal = "IEEE Transactions on Parallel and Distributed
journal-URL = "http://www.computer.org/tpds/archives.htm",
keywords = "CLAM approach; communications environment; message
passing; multithreaded communication; OS-level process;
performance evaluation; protocols; scalable
multiprotocol support; scheduling algorithms; shared
memory systems; shared-memory multiprocessors;
user-space protocols",
treatment = "A Application; P Practical",
author = "T. Gruen and M. A. Hillebrand",
title = "{NAS} Integer Sort on Multi-threaded Shared Memory
journal = j-LECT-NOTES-COMP-SCI,
volume = "1470",
pages = "999--??",
year = "1998",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Oct 10 14:40:24 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "G. Heber and R. Biswas and P. Thulasiraman and G. R.
title = "Using Multithreading for the Automatic Load Balancing
of Adaptive Finite Element Meshes",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1457",
pages = "132--??",
year = "1998",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Oct 10 14:40:24 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Allen Holub",
title = "Programming {Java} threads in the real world:
Threading Architectures",
journal = j-JAVAWORLD,
volume = "3",
number = "9",
pages = "??--??",
month = sep,
year = "1998",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Sep 10 14:37:36 MDT 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.holub.com/goodies/javaworld/jw_index.html;
acknowledgement = ack-nhfb,
author = "Allen Holub",
title = "Programming {Java} threads in the real world, {Part}
2: Common multithreading Pitfalls (Deadlock, etc.)",
journal = j-JAVAWORLD,
volume = "3",
number = "10",
pages = "??--??",
year = "1998",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Mon Jan 4 06:11:43 MST 1999",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.holub.com/goodies/javaworld/jw_index.html;
acknowledgement = ack-nhfb,
author = "Allen Holub",
title = "Programming {Java} threads in the real world, {Part}
3: Semaphore, Lock\_manager, and Mutex",
journal = j-JAVAWORLD,
volume = "3",
number = "11",
pages = "??--??",
year = "1998",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Mon Jan 4 06:11:43 MST 1999",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.holub.com/goodies/javaworld/jw_index.html;
acknowledgement = ack-nhfb,
author = "Allen Holub",
title = "Programming {Java} threads in the real world, {Part}
4: Condition Variables and Counting Semaphores",
journal = j-JAVAWORLD,
volume = "3",
number = "12",
pages = "??--??",
year = "1998",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Mon Jan 4 06:22:03 MST 1999",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.holub.com/goodies/javaworld/jw_index.html;
acknowledgement = ack-nhfb,
author = "Michael A. Hopper",
title = "A compiler framework for multithreaded parallel
type = "Thesis ({Ph.D.})",
school = "School of Electrical and Computer Engineering, Georgia
Institute of Technology",
address = "Atlanta, GA, USA",
pages = "xii + 110",
year = "1998",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
annote = "Directed by William Appelbe.",
keywords = "Compilers (Computer programs); Parallel processing
(Electronic computers)",
author = "Brad Howes",
title = "Template processing classes for {Python}",
journal = j-DDJ,
volume = "23",
number = "2",
pages = "38, 40, 42, 44--46, 48, 100",
month = feb,
year = "1998",
ISSN = "1044-789X",
bibdate = "Thu May 21 19:02:04 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/dr-dobbs.bib;
abstract = "Brad shows how you can embed Python objects in HTML
pages using boilerplate template processing classes.
Then Python creator Guido van Rossum adds a note on
what's new in the just-released Python 1.5.",
acknowledgement = ack-nhfb,
classification = "C6130D (Document processing techniques); C6130M
(Multimedia); C6160J (Object- oriented databases)",
fjournal = "Dr. Dobb's Journal of Software Tools",
keywords = "application program interfaces; BoilerPlate; CGI
infrastructure; conditional control; Emacs; embedded
HTML text; errors; HTML document template; HTML
editing; hypermedia; iterative control; multithreaded
CGI service; object database; object paradigm;
object-oriented databases; page description languages;
persistent objects; placeholders; print statements;
Python; run- time values; run-time HTML generation;
syntax coloring; tagged locations; template HTML
constructs; template processing classes; text regions",
treatment = "P Practical",
author = "Ayal Itzkovitz and Assaf Schuster and Lea Shalev",
title = "Thread migration and its applications in distributed
shared memory systems",
journal = j-J-SYST-SOFTW,
volume = "42",
number = "1",
pages = "71--87",
month = jul,
year = "1998",
ISSN = "0164-1212 (print), 1873-1228 (electronic)",
ISSN-L = "0164-1212",
bibdate = "Thu Dec 17 14:07:21 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "The Journal of systems and software",
journal-URL = "http://www.sciencedirect.com/science/journal/01641212",
author = "Minwen Ji and Edward W. Felten and Kai Li",
title = "Performance measurements for multithreaded programs",
journal = j-SIGMETRICS,
volume = "26",
number = "1",
pages = "161--170",
month = jun,
year = "1998",
CODEN = "????",
DOI = "https://doi.org/10.1145/277858.277900",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Thu Jun 26 11:25:18 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multithreaded programming is an effective way to
exploit concurrency, but it is difficult to debug and
tune a highly threaded program. This paper describes a
performance tool called Tmon for monitoring, analyzing
and tuning the performance of multithreaded programs.
The performance tool has two novel features: it uses
`thread waiting time' as a measure and constructs
thread waiting graphs to show thread dependencies and
thus performance bottlenecks, and it identifies
`semi-busy-waiting' points where CPU cycles are wasted
in condition checking and context switching. We have
implemented the Tmon tool and, as a case study, we have
used it to measure and tune a heavily threaded file
system. We used four workloads to tune different
aspects of the file system. We were able to improve the
file system bandwidth and throughput significantly. In
one case, we were able to improve the bandwidth by two
orders of magnitude.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
author = "Vijay Karamcheti and Andrew A. Chien",
title = "A Hierarchical Load-Balancing Framework for Dynamic
Multithreaded Computations",
crossref = "ACM:1998:SHP",
pages = "??--??",
year = "1998",
bibdate = "Wed Mar 06 06:31:50 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.supercomp.org/sc98/TechPapers/sc98_FullAbstracts/Karamcheti553/index.htm",
acknowledgement = ack-nhfb,
author = "Stephen W. Keckler and William J. Dally and Daniel
Maskit and Nicholas P. Carter and Andrew Chang and Whay
S. Lee",
title = "Exploiting fine-grain thread level parallelism on the
{MIT} multi-{ALU} processor",
journal = j-COMP-ARCH-NEWS,
volume = "26",
number = "3",
pages = "306--317",
month = jun,
year = "1998",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:58 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Jens Krinke",
title = "Static Slicing of Threaded Programs",
journal = j-SIGPLAN,
volume = "33",
number = "7",
pages = "35--42",
month = jul,
year = "1998",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:17:49 MST 2003",
bibsource = "Compendex database; http://portal.acm.org/;
abstract = "Static program slicing is an established method for
analyzing sequential programs, especially for program
understanding, debugging and testing. Until now, there
was no slicing method for threaded programs which
handles interference correctly. We present such a
method which also calculates more precise static
slices. This paper extends the well known structures of
the control flow graph and the program dependence graph
for threaded programs with interference. This new
technique does not require serialization of threaded
acknowledgement = ack-nhfb,
affiliation = "Technische Universitaet Braunschweig",
affiliationaddress = "Braunschweig, Ger",
classification = "723; 723.1; 723.2; 723.5",
conference = "Proceedings of the 1998 ACM SIGPLAN\slash SIGSOFT
Workshop on Program Analysis for Software Tools and
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
journalabr = "ACM SIGPLAN SIGSOFT Workshop Program Anal Software
Tools Eng",
keywords = "Computer aided software engineering; Computer software
selection and evaluation; Control flow graphs; Data
flow analysis; Data structures; Program debugging;
Static program slicing; Threaded programs",
meetingaddress = "Montreal, Can",
meetingdate = "Jun 16 1998",
meetingdate2 = "06/16/98",
sponsor = "ACM",
author = "O. Krone and M. Raab and B. Hirsbrunner",
title = "Load Balancing for Network Based Multi-threaded
journal = j-LECT-NOTES-COMP-SCI,
volume = "1497",
pages = "206--??",
year = "1998",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Jan 5 08:21:58 MST 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Bil Lewis and Daniel J. Berg",
title = "Multithreaded programming with pthreads",
publisher = pub-SUN,
address = pub-SUN:adr,
pages = "xxx + 382",
year = "1998",
ISBN = "0-13-680729-1 (paperback)",
ISBN-13 = "978-0-13-680729-2 (paperback)",
LCCN = "QA76.76.T55 L49 1998",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.amazon.com/exec/obidos/ASIN/0136807291/ref=sim_books/002-4892305-5599452;
acknowledgement = ack-nhfb,
alttitle = "Pthreads",
keywords = "POSIX (Computer software standard); Threads (Computer
programs); UNIX (Computer file)",
author = "Jack L. Lo and Luiz Andr{\'e} Barroso and Susan J.
Eggers and Kourosh Gharachorloo and Henry M. Levy and
Sujay S. Parekh",
title = "An analysis of database workload performance on
simultaneous multithreaded processors",
journal = j-COMP-ARCH-NEWS,
volume = "26",
number = "3",
pages = "39--50",
month = jun,
year = "1998",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:58 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Honghui Lu",
title = "{OpenMP} on Networks of Workstations",
crossref = "ACM:1998:SHP",
pages = "??--??",
year = "1998",
bibdate = "Wed Oct 07 08:50:26 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
author = "Kevin T. Manley",
title = "General-Purpose Threads with {I/O} Completion Ports",
journal = j-CCCUJ,
volume = "16",
number = "4",
pages = "??--??",
month = apr,
year = "1998",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:15 MDT 2002",
bibsource = "http://www.cuj.com/articles/1998/9804/9804toc.htm?topic=articles;
abstract = "Divide and conquer is a good strategy for partitioning
a large job, provided you don't divide too much.
Windows NT helps you guess right.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Edward Mascarenhas and Vernon Rego",
title = "Migrant threads on process farms: parallel programming
with {Ariadne}",
journal = j-CPE,
volume = "10",
number = "9",
pages = "673--698",
day = "10",
month = aug,
year = "1998",
ISSN = "1040-3108",
ISSN-L = "1040-3108",
bibdate = "Tue Sep 7 06:06:42 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=10008703;
acknowledgement = ack-nhfb,
fjournal = "Concurrency, practice and experience",
author = "Chuck McManis",
title = "In Depth: Using threads with collections, {Part 1}",
journal = j-JAVAWORLD,
volume = "3",
number = "3",
pages = "??--??",
month = mar,
year = "1998",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Aug 13 08:48:26 MDT 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/jw-03-1998/jw-03-indepth.html",
acknowledgement = ack-nhfb,
author = "Chuck McManis",
title = "{Java} In Depth: Using threads with collections, part
journal = j-JAVAWORLD,
volume = "3",
number = "6",
pages = "??--??",
month = jun,
year = "1998",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Aug 13 08:48:26 MDT 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/jw-06-1998/jw-06-indepth.html",
acknowledgement = ack-nhfb,
author = "A. J. Nebro and E. Pimentel and J. M. Troya",
title = "Evaluating a Multithreaded Runtime System for
Concurrent Object-Oriented Languages",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1505",
pages = "167--??",
year = "1998",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Jan 5 08:21:58 MST 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Bradford Nichols and Dick Buttlar and Jacqueline
Proulx Farrell",
title = "Pthreads programming",
publisher = pub-ORA,
address = pub-ORA:adr,
pages = "xvi + 267",
year = "1998",
ISBN = "1-56592-115-1",
ISBN-13 = "978-1-56592-115-3",
LCCN = "QA76.642 .N53 1998",
bibdate = "Fri May 10 12:18:17 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "Nutshell handbook",
acknowledgement = ack-nhfb,
annote = "A POSIX standard for better multiprocessing.",
keywords = "compilers (computer programs); parallel programming
(computer science)",
author = "Ian Piumarta and Fabio Riccardi",
title = "Optimizing Direct-threaded Code by Selective
journal = j-SIGPLAN,
volume = "33",
number = "5",
pages = "291--300",
month = may,
year = "1998",
ISBN = "0-89791-987-4",
ISBN-13 = "978-0-89791-987-6",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:17:47 MST 2003",
bibsource = "http://www.acm.org/pubs/contents/proceedings/pldi/277650/index.html;
URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/277650/p291-piumarta/",
acknowledgement = ack-nhfb,
annote = "Published as part of the Proceedings of PLDI'98.",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "algorithms; experimentation; languages; performance",
subject = "{\bf D.3.4} Software, PROGRAMMING LANGUAGES,
Processors, Optimization. {\bf D.3.4} Software,
PROGRAMMING LANGUAGES, Processors, Interpreters. {\bf
D.3.4} Software, PROGRAMMING LANGUAGES, Processors,
Translator writing systems and compiler generators.",
author = "P. J. Plauger",
title = "{Standard C/C++}: Thread Safety",
journal = j-CCCUJ,
volume = "16",
number = "12",
pages = "??--??",
month = dec,
year = "1998",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:18 MDT 2002",
bibsource = "http://www.cuj.com/articles/1998/9812/9812toc.htm?topic=articles;
abstract = "The C++ Standard doesn't talk about thread safety, but
everyone else does.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Dave Pomerantz",
title = "{C++} Notifiers: Simplifying system development",
journal = j-DDJ,
volume = "23",
number = "8",
pages = "26, 28, 30--31, 89--90",
month = aug,
year = "1998",
ISSN = "1044-789X",
bibdate = "Thu Jul 16 13:01:59 MDT 1998",
bibsource = "http://www.ddj.com/ddj/1998/1998_08/;
URL = "http://www.ddj.com/ftp/1998/1998_08/notifier.txt;
abstract = "Notifiers, also called ``events'' or ``messages,'' are
used to pass information anonymously between objects.
Dave shows how notifiers can work in C++, using a
multithreaded application as an example.",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Bill Reck",
title = "Thread Synchronization with Reference-Counting
journal = j-CCCUJ,
volume = "16",
number = "2",
pages = "??--??",
month = feb,
year = "1998",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:14 MDT 2002",
bibsource = "http://www.cuj.com/articles/1998/9802/9802toc.htm?topic=articles;
abstract = "Often, the best time to protect access to a shared
object is right when you reach for it.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "B. Reus and A. Knapp and P. Cenciarelli and M.
title = "Verifying a compiler optimization for Multi-Threaded
journal = j-LECT-NOTES-COMP-SCI,
volume = "1376",
pages = "402--??",
year = "1998",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Oct 10 14:40:24 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java.bib;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Gene Saghi and Kirk Reinholtz and Paul A. Savory",
title = "A Multithreaded Scheduler for a High-speed Spacecraft
journal = j-SPE,
volume = "28",
number = "6",
pages = "641--656",
month = may,
year = "1998",
ISSN = "0038-0644 (print), 1097-024X (electronic)",
ISSN-L = "0038-0644",
bibdate = "Thu Jul 29 15:11:48 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=1802;
acknowledgement = ack-nhfb,
fjournal = "Software --- Practice and Experience",
journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
author = "Douglas C. Schmidt",
title = "Evaluating architectures for multithreaded object
request brokers",
journal = j-CACM,
volume = "41",
number = "10",
pages = "54--60",
month = oct,
year = "1998",
ISSN = "0001-0782 (print), 1557-7317 (electronic)",
ISSN-L = "0001-0782",
bibdate = "Tue Oct 6 21:15:42 MDT 1998",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/journals/cacm/1998-41-10/p54-schmidt/",
acknowledgement = ack-nhfb,
fjournal = "Communications of the ACM",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J79",
author = "S. S. Seiden",
title = "Randomized Online Multi-threaded Paging",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1432",
pages = "264--??",
year = "1998",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Oct 10 14:40:24 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Andrew Shaw and Arvind and Kyoo-Chan Cho and
Christopher Hill and R. Paul Johnson and John
title = "A Comparison of Implicitly Parallel Multithreaded and
Data-Parallel Implementations of an Ocean Model",
journal = j-J-PAR-DIST-COMP,
volume = "48",
number = "1",
pages = "1--51",
day = "10",
month = jan,
year = "1998",
DOI = "https://doi.org/10.1006/jpdc.1997.1390",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:04 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1390/production;
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Andrew Shaw",
title = "Compiling for parallel multithreaded computation on
symmetric multiprocessors",
type = "Thesis ({Ph.D.})",
school = "Massachusetts Institute of Technology, Department of
Electrical Engineering and Computer Science",
address = "Cambridge, MA, USA",
pages = "149",
year = "1998",
bibdate = "Fri Aug 7 09:34:36 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Chin-Kuang Shene",
title = "Multithreaded programming in an introduction to
operating systems course",
journal = j-SIGCSE,
volume = "30",
number = "1",
pages = "242--246",
month = mar,
year = "1998",
DOI = "https://doi.org/10.1145/274790.274305",
ISSN = "0097-8418 (print), 2331-3927 (electronic)",
ISSN-L = "0097-8418",
bibdate = "Sat Nov 17 16:56:29 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "This paper presents a way of teaching multithreaded
programming as a component in an introduction to
operating systems course. Topics include programming
assignments, term projects, and experiences. This paper
also suggests future work for overcoming a bottleneck
that occurs in the current version of this course.",
acknowledgement = ack-nhfb,
fjournal = "SIGCSE Bulletin (ACM Special Interest Group on
Computer Science Education)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688",
author = "J. Silc and B. Robic and T. Ungerer",
title = "Asynchrony in Parallel Computing: From Dataflow to
volume = "1",
number = "1",
pages = "??--??",
month = "????",
year = "1998",
CODEN = "????",
ISSN = "1097-2803",
bibdate = "Fri Dec 19 08:14:11 MST 2003",
bibsource = "http://www.cs.okstate.edu/~pdcp/vols/vol01/vol01no1.html;
URL = "http://www.cs.okstate.edu/~pdcp/vols/vol01/vol01no1abs.html#silc",
acknowledgement = ack-nhfb,
fjournal = "PDCP: Parallel and Distributed Computing Practices",
author = "David B. Skillicorn and Domenico Talia",
title = "Models and languages for parallel computation",
journal = j-COMP-SURV,
volume = "30",
number = "2",
pages = "123--169",
month = jun,
year = "1998",
ISSN = "0360-0300 (print), 1557-7341 (electronic)",
ISSN-L = "0360-0300",
bibdate = "Fri Sep 11 08:35:51 MDT 1998",
bibsource = "http://www.acm.org/pubs/contents/journals/surveys/;
URL = "http://www.acm.org:80/pubs/citations/journals/surveys/1998-30-2/p123-skillicorn/",
abstract = "We survey parallel programming models and languages
using six criteria to assess their suitability for
realistic portable parallel programming. We argue that
an ideal model should be easy to program, should have a
software development methodology, should be
architecture-independent, should be easy to understand,
should guarantee performance, and should provide
accurate information about the cost of programs. These
criteria reflect our belief that developments in
parallelism must be driven by a parallel software
industry based on portability and efficiency. We
consider programming models in six categories,
depending on the level of abstraction they provide.
Those that are very abstract conceal even the presence
of parallelism at the software level. Such models make
software easy to build and port, but efficient and
predictable performance is usually hard to achieve. At
the other end of the spectrum, low-level models make
all of the messy issues of parallel programming
explicit (how many threads, how to place them, how to
express communication, and how to schedule
communication), so that software is hard to build and
not very portable, but is usually efficient. Most
recent models are near the center of this spectrum,
exploring the best tradeoffs between expressiveness and
performance. A few models have achieved both
abstractness and efficiency. Both kinds of models raise
the possibility of parallelism as part of the
mainstream of computing.",
acknowledgement = ack-nhfb,
fjournal = "ACM Computing Surveys",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J204",
keywords = "languages; performance; theory",
subject = "{\bf C.4} Computer Systems Organization, PERFORMANCE
{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language
author = "Geoffrey Smith and Dennis Volpano",
title = "Secure information flow in a multi-threaded imperative
crossref = "ACM:1998:CRP",
pages = "355--364",
year = "1998",
bibdate = "Mon May 3 12:57:52 MDT 1999",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/268946/p355-smith/",
acknowledgement = ack-nhfb,
keywords = "algorithms; languages; security; theory",
subject = "{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS
OF PROGRAMS, Studies of Program Constructs, Type
structure. {\bf D.3.0} Software, PROGRAMMING LANGUAGES,
General. {\bf D.2.0} Software, SOFTWARE ENGINEERING,
General, Protection mechanisms. {\bf D.1.3} Software,
PROGRAMMING TECHNIQUES, Concurrent Programming.",
author = "Patrick Tennberg",
title = "Creating Active Data Types via Multithreading",
journal = j-CCCUJ,
volume = "16",
number = "1",
pages = "??--??",
month = jan,
year = "1998",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:13 MDT 2002",
bibsource = "http://www.cuj.com/articles/1998/9801/9801toc.htm?topic=articles;
abstract = "If you need multiple active agents in a program, you
need multiple threads to synchronize them.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "K. Thitikamol and P. Keleher",
title = "Per-node multithreading and remote latency",
journal = j-IEEE-TRANS-COMPUT,
volume = "47",
number = "4",
pages = "414--426",
month = apr,
year = "1998",
DOI = "https://doi.org/10.1109/12.675711",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Wed Jul 6 09:35:54 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=675711",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "John Thornley and K. Mani Chandy and Hiroshi Ishii",
title = "A System for Structured High-Performance Multithreaded
Programming in {Windows NT}",
crossref = "USENIX:1998:PUWa",
pages = "??--??",
year = "1998",
bibdate = "Fri Oct 18 07:49:55 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.usenix.org/publications/library/proceedings/usenix-nt98/thornley.html;
acknowledgement = ack-nhfb,
author = "J.-Y. Tsai and Z. Jiang and P.-C. Yew",
title = "Program Optimization for Concurrent Multithreaded
journal = j-LECT-NOTES-COMP-SCI,
volume = "1366",
pages = "146--??",
year = "1998",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Oct 10 14:40:24 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Dean M. Tullsen and Susan J. Eggers and Henry M.
title = "Retrospective: {Simultaneous} multithreading:
maximizing on-chip parallelism",
crossref = "ACM:1998:PAI",
pages = "115--116",
year = "1998",
bibdate = "Fri May 12 17:56:30 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
remark = "25 years of the International Symposia on Computer
Architecture (selected papers).",
author = "Dean M. Tullsen and Susan J. Eggers and Henry M.
title = "Simultaneous multithreading: maximizing on-chip
crossref = "ACM:1998:PAI",
pages = "533--544",
year = "1998",
bibdate = "Fri May 12 17:56:30 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
remark = "25 years of the International Symposia on Computer
Architecture (selected papers).",
author = "Bill Venners",
title = "Design for thread safety",
journal = j-JAVAWORLD,
volume = "3",
number = "8",
pages = "??--??",
month = aug,
year = "1998",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Sep 10 14:37:30 MDT 1998",
bibsource = "http://www.javaworld.com/javaworld/;
URL = "http://www.javaworld.com/javaworld/jw-08-1998/jw-08-techniques.htm",
acknowledgement = ack-nhfb,
author = "Uzi Vishkin and Shlomit Dascal and Efraim Berkovich
and Joseph Nuzman",
booktitle = "SPAA '98: 10th Annual ACM Symposium on Parallel
Algorithms and Architectures, June 28--July 2, 1998,
Puerto Vallarta, Mexico",
title = "Explicit multi-threading ({XMT}) bridging models for
instruction parallelism (extended abstract)",
publisher = pub-ACM,
address = pub-ACM:adr,
year = "1998",
DOI = "https://doi.org/10.1145/277651.277680",
ISBN = "0-89791-989-0",
ISBN-13 = "978-0-89791-989-0",
LCCN = "QA76.58 .A26 1998",
bibdate = "Fri Jul 27 05:37:45 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "ACM order number 417980.",
URL = "http://delivery.acm.org/10.1145/280000/277680/p140-vishkin.pdf",
acknowledgement = ack-nhfb,
bookpages = "viii + 310",
keywords = "IA-64",
author = "Steven Wallace and Brad Calder and Dean M. Tullsen",
title = "Threaded multiple path execution",
journal = j-COMP-ARCH-NEWS,
volume = "26",
number = "3",
pages = "238--249",
month = jun,
year = "1998",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:58 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Boris Weissman",
title = "Active threads: towards efficient fine-grained
parallelism in object-oriented systems",
type = "Thesis ({Ph.D. in Computer Science})",
school = "Department of Computer Science, University of
California, Berkeley",
address = "Berkeley, CA, USA",
year = "1998",
LCCN = "T7.6.1998 W457",
bibdate = "Fri May 10 12:18:17 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "dissertations, academic -- UCB -- Computer Science --
1991--2000; University of California, Berkeley, Dept.
Of Computer Science -- dissertations",
author = "Boris Weissman",
title = "Performance Counters and State Sharing Annotations: a
Unified Approach to Thread Locality",
journal = j-SIGPLAN,
volume = "33",
number = "11",
pages = "127--138",
month = nov,
year = "1998",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:17:54 MST 2003",
bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
note = "Co-published in {\em Operating Systems Review}, {\bf
URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/291069/p127-weissman/",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "design; experimentation; measurement; performance;
subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process
Management, Scheduling. {\bf F.1.2} Theory of
Computation, Parallelism and concurrency. {\bf D.4.8}
Software, OPERATING SYSTEMS, Performance, Simulation.
{\bf G.3} Mathematics of Computing, PROBABILITY AND
STATISTICS, Markov processes.",
author = "Norman Wilde and Christopher Casey and Joe Vandeville
and Gary Trio and Dick Hotz",
title = "Reverse engineering of software threads: a design
recovery technique for large multi-process systems",
journal = j-J-SYST-SOFTW,
volume = "43",
number = "1",
pages = "11--17",
month = oct,
year = "1998",
ISSN = "0164-1212 (print), 1873-1228 (electronic)",
ISSN-L = "0164-1212",
bibdate = "Wed Dec 16 08:24:49 MST 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "The Journal of systems and software",
journal-URL = "http://www.sciencedirect.com/science/journal/01641212",
author = "Dick Wilmot",
title = "Data threaded microarchitecture",
journal = j-COMP-ARCH-NEWS,
volume = "26",
number = "5",
pages = "22--32",
month = dec,
year = "1998",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:21 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Honbo Zhou and Al Geist",
title = "{LPVM}: a step towards multithread {PVM}",
journal = j-CPE,
volume = "10",
number = "5",
pages = "407--416",
day = "25",
month = apr,
year = "1998",
ISSN = "1040-3108",
ISSN-L = "1040-3108",
bibdate = "Tue Sep 7 06:06:40 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=5385;
acknowledgement = ack-nhfb,
fjournal = "Concurrency, practice and experience",
author = "Anonymous",
title = "Bookshelf: Surviving the Top Ten Challenges of
Software Development; The {Year 2000} Crisis; The
Continuing Challenge; Software Project Survival Guide;
Object-Oriented Multithreading Using {C++}",
journal = j-IEEE-SOFTWARE,
volume = "16",
number = "1",
pages = "114--??",
month = jan # "\slash " # feb,
year = "1999",
ISSN = "0740-7459 (print), 1937-4194 (electronic)",
ISSN-L = "0740-7459",
bibdate = "Thu Apr 1 16:52:57 MST 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib;
URL = "http://dlib.computer.org/so/books/so1999/pdf/s1114.pdf",
acknowledgement = ack-nhfb,
fjournal = "IEEE Software",
journal-URL = "http://www.computer.org/portal/web/csdl/magazines/software",
author = "G. Antoniu and L. Bouge and R. Namyst",
title = "An Efficient and Transparent Thread Migration Scheme
in the {PM2} Runtime System",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1586",
pages = "496--??",
year = "1999",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon Sep 13 16:57:02 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Alain Azagury and Elliot K. Kolodner and Erez
title = "A Note on the Implementation of Replication-Based
Garbage Collection for Multithreaded Applications and
Multiprocessor Environments",
volume = "9",
number = "3",
pages = "391--??",
month = sep,
year = "1999",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
bibdate = "Thu Jan 6 12:02:35 MST 2005",
bibsource = "http://ejournals.wspc.com.sg/ppl/;
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "Robert D. Blumofe and Charles E. Leiserson",
title = "Scheduling multithreaded computations by work
journal = j-J-ACM,
volume = "46",
number = "5",
pages = "720--748",
month = sep,
year = "1999",
ISSN = "0004-5411 (print), 1557-735X (electronic)",
ISSN-L = "0004-5411",
bibdate = "Sun Jan 23 12:19:49 MST 2000",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org/pubs/citations/journals/jacm/1999-46-5/p720-blumofe/",
acknowledgement = ack-nhfb,
fjournal = "Journal of the ACM",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J401",
author = "L. Bouge and J.-F. Mehaut and R. Namyst",
title = "Efficient Communications in Multithreaded Runtime
journal = j-LECT-NOTES-COMP-SCI,
volume = "1586",
pages = "468--482",
year = "1999",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Fri Mar 16 07:33:54 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Allen Broadman and Eric Shaw",
title = "Executing a Class Member in Its Own Thread",
journal = j-CCCUJ,
volume = "17",
number = "12",
pages = "??--??",
month = dec,
year = "1999",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:24 MDT 2002",
bibsource = "http://www.cuj.com/articles/1999/9912/9912toc.htm?topic=articles;
abstract = "Creating a separate thread to execute a member
function call is a messy business that's often
necessary. It's a task well worth encapsulating.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "F. Cappello and O. Richard and D. Etiemble",
title = "Performance of the {NAS} Benchmarks on a Cluster of
{SMP PCs} Using a Parallelization of the {MPI} Programs
with {OpenMP}",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1662",
pages = "339--350",
year = "1999",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon Sep 13 16:57:02 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999b.bib;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "P. Cenciarelli and A. Knapp and B. Reus and M.
title = "An Event-Based Structural Operational Semantics of
Multi-Threaded {Java}",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1523",
pages = "157--??",
year = "1999",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon Sep 13 16:57:02 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Robert S. Chappell and Jared Stark and Sangwook P. Kim
and Steven K. Reinhardt and Yale N. Patt",
title = "Simultaneous subordinate microthreading {(SSMT)}",
journal = j-COMP-ARCH-NEWS,
volume = "27",
number = "2",
pages = "186--195",
month = may,
year = "1999",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:49 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Shlomit Dascal and Uzi Vishkin",
title = "Experiments with List Ranking for Explicit
Multi-Threaded {(XMT)} Instruction Parallelism
(Extended Abstract)",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1668",
pages = "43--??",
year = "1999",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon Feb 4 12:03:08 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1668.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1668/16680043.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Juan A. de la Puente and Jos{\'e} F. Ruiz and
Jes{\'u}s M. Gonz{\'a}lez-Barahona",
title = "Real-Time Programming with {GNAT}: Specialized Kernels
versus {POSIX} Threads",
journal = j-SIGADA-LETTERS,
volume = "19",
number = "2",
pages = "73--77",
month = jun,
year = "1999",
ISSN = "1094-3641 (print), 1557-9476 (electronic)",
ISSN-L = "1094-3641",
bibdate = "Tue Aug 31 07:04:20 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGADA Ada Letters",
author = "Anthony DeWitt and Thomas Gross",
title = "The potential of thread-level speculation based on
value profiling",
journal = j-COMP-ARCH-NEWS,
volume = "27",
number = "1",
pages = "22--22",
month = mar,
year = "1999",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:35 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Kenneth J. Duda and David R. Cheriton",
title = "Borrowed-virtual-time {(BVT)} scheduling: supporting
latency-sensitive threads in a general-purpose
journal = j-OPER-SYS-REV,
volume = "33",
number = "5",
pages = "261--276",
month = dec,
year = "1999",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:55 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "F. Garcia and A. Calderon and J. Carretero",
title = "{MiMPI}: a multithread-safe implementation of
crossref = "Dongarra:1999:RAP",
number = "1697",
pages = "207--214",
year = "1999",
bibdate = "Thu Dec 9 06:08:35 MST 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "John Greiner and Guy E. Blelloch",
title = "A provably time-efficient parallel implementation of
full speculation",
journal = j-TOPLAS,
volume = "21",
number = "2",
pages = "240--285",
month = mar,
year = "1999",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Tue Sep 26 10:12:58 MDT 2000",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
URL = "http://www.acm.org/pubs/citations/journals/toplas/1999-21-2/p240-greiner/",
abstract = "Speculative evaluation, including leniency and
futures, is often used to produce high degrees of
parallelism. Understanding the performance
characteristics of such evaluation, however, requires
having a detailed understanding of the implementation.
For example, the particular implementation technique
used to suspend and reactivate threads can have an
asymptotic effect on performance. With the goal of
giving the users some understanding of performance
without requiring them to understand the
implementation, we present a provable implementation
bound for a language based on speculative evaluation.
The idea is (1) to supply the users with a semantics
for a language that defines abstract costs for
measuring or analyzing the performance of computations,
(2) to supply the users with a mapping of these costs
onto runtimes on various machine models, and (3) to
describe an implementation strategy of the language and
prove that it meets these mappings. For this purpose we
consider a simple language based on speculative
evaluation. For every computation, the semantics of the
language returns a directed acyclic graph (DAG) in
which each node represents a unit of computation, and
each edge represents a dependence. We then describe an
implementation strategy of the language and show that
any computation with $w$ work (the number of nodes in
the DAG) and $d$ depth (the length of the longest path
in the DAG) will run on a $p$-processor PRAM in $ O(w /
p + d \log p) $ time. The bounds are work efficient
(within a constant factor of linear speedup) when there
is sufficient parallelism, $ w / d \geq p \log p $. These
are the first time bounds we know of for languages with
speculative evaluation. The main challenge is in
parallelizing the necessary queuing operations on
suspended threads.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Programming Languages and
generalterms = "Languages; Performance; Theory",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
keywords = "abstract machines; parallel languages; profiling
semantics; speculation; threads",
subject = "Software --- Software Engineering --- Metrics (D.2.8);
Software --- Programming Languages --- Language
Classifications (D.3.2): {\bf Data-flow languages};
Software --- Programming Languages --- Language
Classifications (D.3.2); Theory of Computation ---
Computation by Abstract Devices --- Modes of
Computation (F.1.2): {\bf Parallelism and concurrency};
Theory of Computation --- Computation by Abstract
Devices --- Modes of Computation (F.1.2); Theory of
Computation --- Logics and Meanings of Programs ---
Specifying and Verifying and Reasoning about Programs
author = "Yan Gu and B. S. Lee and Wentong Cai",
title = "Evaluation of {Java} thread performance on two
different multithreaded kernels",
journal = j-OPER-SYS-REV,
volume = "33",
number = "1",
pages = "34--46",
month = jan,
year = "1999",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:37 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "John Harrington",
title = "{Win32} Multithreading Made Easy",
journal = j-CCCUJ,
volume = "17",
number = "8",
pages = "48, 50--52, 54--56",
month = aug,
year = "1999",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:22 MDT 2002",
bibsource = "http://www.cuj.com/articles/1999/9908/9908toc.htm?topic=articles;
abstract = "Multithreading logic is hard to write and hard to
maintain. So keep it simple and separate.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Allen Holub",
title = "Programming {Java} threads in the real world, {Part}
5: Timers",
journal = j-JAVAWORLD,
volume = "4",
number = "2",
pages = "??--??",
month = feb,
year = "1999",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Mar 04 12:56:16 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.holub.com/goodies/javaworld/jw_index.html",
acknowledgement = ack-nhfb,
author = "Allen Holub",
title = "Programming {Java} threads in the real world, {Part}
6: {Mach '99}: Observer and the Mysteries of the
journal = j-JAVAWORLD,
volume = "4",
number = "3",
pages = "??--??",
month = mar,
year = "1999",
CODEN = "????",
ISSN = "1091-8906",
bibdate = "Thu Mar 04 12:56:16 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.holub.com/goodies/javaworld/jw_index.html",
acknowledgement = ack-nhfb,
author = "J. Jonsson and H. Loenn and K. G. Shin",
title = "Non-preemptive Scheduling of Real-Time Threads on
Multi-Level-Context Architectures",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1586",
pages = "363--??",
year = "1999",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon Sep 13 16:57:02 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Vijay Karamcheti and Andrew A. Chien",
title = "Architectural Support and Mechanisms for Object
Caching in Dynamic Multithreaded Computations",
journal = j-J-PAR-DIST-COMP,
volume = "58",
number = "2",
pages = "260--300",
month = aug,
year = "1999",
DOI = "https://doi.org/10.1006/jpdc.1999.1555",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:08 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1555/production;
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "S. W. Keckler and A. Chang and W. S. Lee and S. Chatterjee
and W. J. Dally",
title = "Concurrent event handling through multithreading",
journal = j-IEEE-TRANS-COMPUT,
volume = "48",
number = "9",
pages = "903--916",
month = sep,
year = "1999",
DOI = "https://doi.org/10.1109/12.795220",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Wed Jul 6 08:46:59 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=795220",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "V. Krishnan and J. Torrellas",
title = "A chip-multiprocessor architecture with speculative
journal = j-IEEE-TRANS-COMPUT,
volume = "48",
number = "9",
pages = "866--880",
month = sep,
year = "1999",
DOI = "https://doi.org/10.1109/12.795218",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Wed Jul 6 08:46:59 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=795218",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "S. Kusakabe and K. Inenaga and M. Amamiya and X.
title = "Implementing a Non-strict Functional Programming
Language on a Threaded Architecture",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1586",
pages = "138--??",
year = "1999",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon Sep 13 16:57:02 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "H. Kwak and B. Lee and A. R. Hurson and Suk-Han Yoon
and Woo-Jong Hahn",
title = "Effects of multithreading on cache performance",
journal = j-IEEE-TRANS-COMPUT,
volume = "48",
number = "2",
pages = "176--184",
month = feb,
year = "1999",
DOI = "https://doi.org/10.1109/12.752659",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Wed Jul 6 08:46:56 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=752659",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "J. L. Lo and S. S. Parekh and S. J. Eggers and H. M.
Levy and D. M. Tullsen",
title = "Software-Directed Register Deallocation for
Simultaneous Multithreaded Processors",
volume = "10",
number = "9",
pages = "922--??",
month = sep,
year = "1999",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu Oct 12 18:48:31 MDT 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
URL = "http://dlib.computer.org/td/books/td1999/pdf/l0922.pdf;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Jack L. Lo and Susan J. Eggers and Henry M. Levy and
Sujay S. Parekh and Dean M. Tullsen",
title = "Tuning Compiler Optimizations for Simultaneous
journal = j-INT-J-PARALLEL-PROG,
volume = "27",
number = "6",
pages = "477--503",
month = dec,
year = "1999",
DOI = "https://doi.org/10.1023/A:1018780200739",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Wed Jul 6 16:39:54 MDT 2005",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=27&issue=6;
OCLC Contents1st database",
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=27&issue=6&spage=477",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
remark = "Special Issue: {30th Annual ACM\slash IEEE
International Symposium on Microarchitecture}, Part
author = "Lars Lundberg",
title = "Predicting and Bounding the Speedup of Multithreaded
{Solaris} Programs",
journal = j-J-PAR-DIST-COMP,
volume = "57",
number = "3",
pages = "322--333",
month = jun,
year = "1999",
DOI = "https://doi.org/10.1006/jpdc.1999.1536",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:07 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1536/production;
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Kevin Manley",
title = "Improving Performance with Thread-Private Heaps",
journal = j-CCCUJ,
volume = "17",
number = "9",
pages = "50--??",
month = sep,
year = "1999",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:22 MDT 2002",
bibsource = "http://www.cuj.com/articles/1999/9909/9909toc.htm?topic=articles;
abstract = "Threads interact in the darndest ways, but conflicts
with a common heap are particularly pernicious. Luckily
they can be avoided.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "P. Marcuello and A. Gonzalez",
title = "Exploiting Speculative Thread-Level Parallelism on a
{SMT} Processor",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1593",
pages = "754--??",
year = "1999",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon Sep 13 16:57:02 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Brian Masney",
title = "Introduction to Multi-Threaded Programming",
journal = j-LINUX-J,
volume = "61",
pages = "??--??",
month = may,
year = "1999",
ISSN = "1075-3583 (print), 1938-3827 (electronic)",
ISSN-L = "1075-3583",
bibdate = "Thu Jun 3 06:34:02 MDT 1999",
bibsource = "http://www.linuxjournal.com/issue61/index.html;
abstract = "A description of thread programming basics.",
acknowledgement = ack-nhfb,
fjournal = "Linux journal",
journal-URL = "http://portal.acm.org/citation.cfm?id=J508",
author = "Avi Mendelson and Michael Bekerman",
title = "Design Alternatives of Multithreaded Architecture",
journal = j-INT-J-PARALLEL-PROG,
volume = "27",
number = "3",
pages = "161--193",
month = jun,
year = "1999",
DOI = "https://doi.org/10.1023/A:1018733528538",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Wed Jul 6 16:39:53 MDT 2005",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=27&issue=3;
OCLC Contents1st database",
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=27&issue=3&spage=161",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
author = "Nicholas Mitchell and Larry Carter and Jeanne Ferrante
and Dean Tullsen",
title = "Instruction-level Parallelism vs. Thread-level
Parallelism on Simultaneous Multi-threading
crossref = "ACM:1999:SPO",
pages = "??--??",
year = "1999",
bibdate = "Thu Feb 24 09:02:57 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
author = "Scott Arthur Moody and Samuel Kwok and Dale Karr",
title = "{SimpleGraphics}: {Tcl\slash Tk} visualization of
real-time multi-threaded and distributed applications",
journal = j-SIGADA-LETTERS,
volume = "19",
number = "2",
pages = "60--66",
month = jun,
year = "1999",
ISSN = "1094-3641 (print), 1557-9476 (electronic)",
ISSN-L = "1094-3641",
bibdate = "Sat Aug 9 09:06:06 MDT 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGAda Ada Letters",
author = "Girija J. Narlikar and Guy E. Blelloch",
title = "Space-Efficient Scheduling of Nested Parallelism",
journal = j-TOPLAS,
volume = "21",
number = "1",
pages = "138--173",
month = jan,
year = "1999",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Tue Sep 26 10:12:58 MDT 2000",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
URL = "http://www.acm.org/pubs/citations/journals/toplas/1999-21-1/p138-narlikar/",
abstract = "Many of today's high-level parallel languages support
dynamic, fine-grained parallelism. These languages
allow the user to expose all the parallelism in the
program, which is typically of a much higher degree
than the number of processors. Hence an efficient
scheduling algorithm is required to assign computations
to processors at runtime. Besides having low overheads
and good load balancing, it is important for the
scheduling algorithm to minimize the space usage of the
parallel program. This article presents an on-line
scheduling algorithm that is provably space efficient
and time efficient for nested-parallel languages. For a
computation with depth $D$ and serial space requirement
$ S_1 $, the algorithm generates a schedule that
requires at most $ S_1 + O(K \cdot D \cdot p) $ space
(including scheduler space) on $p$ processors. Here,
$K$ is a user-adjustable runtime parameter specifying
the net amount of memory that a thread may allocate
before it is preempted by the scheduler. Adjusting the
value of $K$ provides a trade-off between the running
time and the memory requirement of a parallel
computation. To allow the scheduler to scale with the
number of processors we also parallelize the scheduler
and analyze the space and time bounds of the
computation to include scheduling costs. In addition to
showing that the scheduling algorithm is space and time
efficient in theory, we demonstrate that it is
effective in practice. We have implemented a runtime
system that uses our algorithm to schedule lightweight
parallel threads. The results of executing parallel
programs on this system show that our scheduling
algorithm significantly reduces memory usage compared
to previous techniques, without compromising
performance.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Programming Languages and
Systems",
generalterms = "Algorithms; Languages; Performance",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
keywords = "dynamic scheduling; multithreading; nested
parallelism; parallel language implementation; space
subject = "Software --- Programming Techniques --- Concurrent
Programming (D.1.3): {\bf Parallel programming};
Software --- Programming Languages --- Processors
(D.3.4): {\bf Run-time environments}; Theory of
Computation --- Analysis of Algorithms and Problem
Complexity --- General (F.2.0)",
author = "Z. Nemeth and H. Tomiyasu and P. Kacsuk and M.
Amamiya",
title = "Multithreaded {LOGFLOW} on {KUMP\slash} {D}",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1615",
pages = "320--??",
year = "1999",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon Sep 13 16:57:02 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999b.bib;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Christopher H. Nevison",
title = "Seminar: safe concurrent programming in {Java} with
{CSP}",
journal = j-SIGCSE,
volume = "31",
number = "1",
pages = "367",
month = mar,
year = "1999",
DOI = "https://doi.org/10.1145/384266.299817",
ISSN = "0097-8418 (print), 2331-3927 (electronic)",
ISSN-L = "0097-8418",
bibdate = "Sat Nov 17 16:56:36 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "We present methods for safe and correct programming
for concurrent threads in Java. The methods are based
on the principles of Concurrent Sequential Processes
(CSP). We demonstrate the use of tools which provide
the structure of CSP within Java to avoid some of the
pitfalls of multithreaded programming using monitors,
the primitive synchronization tool in Java. Several
examples illustrate the use of these tools.",
acknowledgement = ack-nhfb,
fjournal = "SIGCSE Bulletin (ACM Special Interest Group on
Computer Science Education)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688",
author = "Scott Oaks and Henry Wong",
title = "{Java} threads",
publisher = pub-ORA,
address = pub-ORA:adr,
edition = "Second",
pages = "xiii + 319",
year = "1999",
ISBN = "1-56592-418-5",
ISBN-13 = "978-1-56592-418-5",
LCCN = "QA76.73.J38 O25 1999",
bibdate = "Fri May 10 12:18:17 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "Java series",
acknowledgement = ack-nhfb,
keywords = "Java (computer program language); threads (computer
author = "Lalit Pant",
title = "Thread Communication In Parallel Algorithms: Enabling
efficient interaction between threads",
journal = j-DDJ,
volume = "24",
number = "4",
pages = "32, 34, 36, 38--39",
month = apr,
year = "1999",
ISSN = "1044-789X",
bibdate = "Wed Mar 3 06:30:11 MST 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.ddj.com/ftp/1999/1999_04/parallel.txt",
abstract = "With the increasing availability of multiprocessing
hardware, thread-based parallel algorithms are becoming
more and more important. Lalit presents thread
communication mechanisms for use within parallel
algorithms. Additional resources include parallel.txt
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Thuan Q. Pham and Pankaj K. Garg",
title = "Multithreaded Programming with {Win32}",
publisher = pub-PHPTR,
address = pub-PHPTR:adr,
pages = "xix + 219",
year = "1999",
ISBN = "0-13-010912-6",
ISBN-13 = "978-0-13-010912-5",
LCCN = "QA76.642.P518 1998",
bibdate = "Thu Jan 21 18:58:23 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Includes CD-ROM.",
URL = "http://www.phptr.com/ptrbooks/ptr_0130109126.html",
acknowledgement = ack-nhfb,
publishersnote = "If you want to deliver NT applications with maximum
performance, efficiency and robustness, you need to
master multithreading. Multithreaded Programming with
Win32 brings together every Win32 multithreading
technique and concept you must know --- all brilliantly
explained with practical examples and sample code.",
xxnote = "Check pages and year??",
author = "P. J. Plauger",
title = "{Standard C/C++}: a Better Red-Black Tree",
journal = j-CCCUJ,
volume = "17",
number = "7",
pages = "10--??",
month = jul,
year = "1999",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:21 MDT 2002",
bibsource = "http://www.cuj.com/articles/1999/9907/9907toc.htm?topic=articles;
abstract = "The C++ Standard is silent about issues such as thread
safety and DLL safety, but customers and reviewers
certainly aren't.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Etienne Richards",
title = "Adding Level-2 Thread Safety to Existing Objects",
journal = j-CCCUJ,
volume = "17",
number = "2",
pages = "??--??",
month = feb,
year = "1999",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:19 MDT 2002",
bibsource = "http://www.cuj.com/articles/1999/9902/9902toc.htm?topic=articles;
abstract = "The code required to share an object among multiple
threads is tedious and error prone. But it can be
neatly encapsulated.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Jonathan Ringle",
title = "Singleton Creation the Thread-safe Way",
journal = j-CCCUJ,
volume = "17",
number = "10",
pages = "??--??",
month = oct,
year = "1999",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:23 MDT 2002",
bibsource = "http://www.cuj.com/articles/1999/9910/9910toc.htm?topic=articles;
abstract = "Singletons avoid problems with order of construction,
at the cost of more problems for multithreading.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Jeremy B. Rodgers and Rhonda Kay Gaede and Jeffrey H.
Kulick",
title = "{IN-Tune}: an {In-Situ} non-invasive performance
tuning tool for multi-threaded {Linux} on symmetric
multiprocessing {Pentium} workstations",
journal = j-SPE,
volume = "29",
number = "9",
pages = "775--792",
day = "25",
month = jul,
year = "1999",
ISSN = "0038-0644 (print), 1097-024X (electronic)",
ISSN-L = "0038-0644",
bibdate = "Thu Jul 29 15:12:27 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=62501865;
acknowledgement = ack-nhfb,
fjournal = "Software---Practice and Experience",
journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
author = "Kevin Roe and Piyush Mehrotra",
title = "Parallelization of a multigrid incompressible viscous
cavity flow solver using {openMP}",
type = "{NASA} contractor report",
number = "NASA\slash CR-1999-209551",
institution = inst-NLRC,
address = inst-NLRC:adr,
pages = "????",
year = "1999",
bibdate = "Thu Mar 16 07:20:02 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Also ICASE report 99-36.",
acknowledgement = ack-nhfb,
author = "Michiel Ronsse and Koen {De Bosschere}",
title = "{RecPlay}: a fully integrated practical record\slash
replay system",
journal = j-TOCS,
volume = "17",
number = "2",
pages = "133--152",
month = may,
year = "1999",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Tue Sep 26 07:54:31 MDT 2000",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
URL = "http://www.acm.org/pubs/citations/journals/tocs/1999-17-2/p133-ronsse/",
abstract = "This article presents a practical solution for the
cyclic debugging of nondeterministic parallel programs.
The solution consists of a combination of record\slash
replay with automatic on-the-fly data race detection.
This combination enables us to limit the record phase
to the more efficient recording of the synchronization
operations, while deferring the time-consuming data
race detection to the replay phase. As the record phase
is highly efficient, there is no need to switch it off,
hereby eliminating the possibility of Heisenbugs
because tracing can be left on all the time. This
article describes an implementation of the tools needed
to support RecPlay.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer Systems",
generalterms = "Algorithms; Experimentation; Reliability",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
keywords = "binary code modification; multithreaded programming;
race detection",
subject = "Software --- Programming Techniques --- Concurrent
Programming (D.1.3): {\bf Parallel programming};
Software --- Software Engineering --- Testing and
Debugging (D.2.5): {\bf Debugging aids}; Software ---
Software Engineering --- Testing and Debugging (D.2.5):
{\bf Monitors}; Software --- Software Engineering ---
Testing and Debugging (D.2.5): {\bf Tracing}; Software
--- Operating Systems --- Process Management (D.4.1):
{\bf Concurrency}; Software --- Operating Systems ---
Process Management (D.4.1): {\bf Deadlocks}; Software
--- Operating Systems --- Process Management (D.4.1):
{\bf Multiprocessing/multiprogramming/multitasking};
Software --- Operating Systems --- Process Management
(D.4.1): {\bf Mutual exclusion}; Software --- Operating
Systems --- Process Management (D.4.1): {\bf
author = "Radu Rugina and Martin Rinard",
title = "Pointer Analysis for Multithreaded Programs",
journal = j-SIGPLAN,
volume = "34",
number = "5",
pages = "77--90",
month = may,
year = "1999",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:18:03 MST 2003",
bibsource = "http://www.acm.org/pubs/contents/proceedings/pldi/301122/index.html;
note = "See PLDI'99 proceedings \cite{ACM:1999:PASa}.",
URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/301122/p77-rugina/",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "H. Saito and N. Stavrakos and C. Polychronopoulos",
title = "Multithreading Runtime Support for Loop and Functional
Parallelism",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1615",
pages = "133--??",
year = "1999",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon Sep 13 16:57:02 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999b.bib;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Steven Howard Samorodin",
title = "Supporting flexible safety and sharing in
multi-threaded environments",
type = "Thesis ({M.S.})",
school = "Computer Science Department, University of California,
address = "Davis, CA, USA",
pages = "39",
year = "1999",
bibdate = "Sat Apr 20 11:17:26 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Alex Scherer and Honghui Lu and Thomas Gross and Willy
Zwaenepoel",
title = "Transparent adaptive parallelism on {NOWs} using
{OpenMP}",
journal = j-SIGPLAN,
volume = "34",
number = "8",
pages = "96--106",
month = aug,
year = "1999",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:18:06 MST 2003",
bibsource = "http://www.acm.org/pubs/contents/proceedings/ppopp/301104/;
URL = "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p96-scherer/",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Steven S. Seiden",
title = "Randomized Online Multi-Threaded Paging",
journal = j-NORDIC-J-COMPUT,
volume = "6",
number = "2",
pages = "148--??",
month = "Summer",
year = "1999",
ISSN = "1236-6064",
bibdate = "Fri Oct 13 05:25:14 MDT 2000",
bibsource = "http://www.cs.helsinki.fi/njc/njc6.html;
URL = "http://www.cs.helsinki.fi/njc/References/seiden1999:148.html",
acknowledgement = ack-nhfb,
fjournal = "Nordic Journal of Computing",
author = "Kai Shen and Hong Tang and Tao Yang",
title = "Adaptive Two-level Thread Management for Fast {MPI}
Execution on Shared Memory Machines",
crossref = "ACM:1999:SPO",
pages = "??--??",
year = "1999",
bibdate = "Thu Feb 24 09:02:57 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
author = "Balaram Sinharoy",
title = "Compiler optimization to improve data locality for
processor multithreading",
journal = j-SCI-PROG,
volume = "7",
number = "1",
pages = "21--37",
month = "????",
year = "1999",
ISSN = "1058-9244 (print), 1875-919X (electronic)",
ISSN-L = "1058-9244",
bibdate = "Thu Mar 28 12:27:27 MST 2002",
bibsource = "Compendex database;
OCLC Article1st database",
URL = "http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=64cr5a4mg33tuhcbdr02%26referrer=parent%26backto=issue%2C2%2C7%3Bjournal%2C8%2C9%3Blinkingpublicationresults%2C1%2C1",
acknowledgement = ack-nhfb,
fjournal = "Scientific Programming",
journal-URL = "http://iospress.metapress.com/content/1058-9244",
author = "Salvatore Storino and John M. Borkenhagen and Ronald
N. Kalla and Steven R. Kunkel",
title = "A Multi-Threaded 64-bit {PowerPC} Commercial {RISC}
Processor Design",
crossref = "IEEE:1999:HCS",
pages = "??--??",
year = "1999",
bibdate = "Mon Jan 08 05:28:04 2001",
bibsource = "ftp://www.hotchips.org//pub/hotc7to11cd/hc99/hc11_pdf/hc99.s1.1.Storino.txt;
acknowledgement = ack-nhfb,
author = "Herb Sutter",
title = "Optimizations That Aren't (In a Multithreaded World)",
journal = j-CCCUJ,
volume = "17",
number = "6",
pages = "??--??",
month = jun,
year = "1999",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:21 MDT 2002",
bibsource = "http://www.cuj.com/articles/1999/9906/9906toc.htm?topic=articles;
abstract = "An ``obvious'' optimization can really lose ground
when thread safety has to be ensured as well.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Kian-Lee Tan and Cheng Hian Goh and Beng Chin Ooi",
title = "Online Feedback for Nested Aggregate Queries with
Multi-Threading",
crossref = "Atkinson:1999:PTF",
pages = "18--29",
year = "1999",
bibdate = "Fri Jan 12 07:50:37 MST 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
http://www.vldb.org/dblp/db/conf/vldb/vldb99.html; OCLC
Proceedings database",
URL = "http://www.vldb.org/dblp/db/conf/vldb/TanGO99.html",
acknowledgement = ack-nhfb,
authorurl = "http://www.vldb.org/dblp/db/indices/a-tree/t/Tan:Kian=Lee.html;
author = "Xinan Tang and Guang R. Gao",
title = "Automatically Partitioning Threads for Multithreaded
Architectures",
journal = j-J-PAR-DIST-COMP,
volume = "58",
number = "2",
pages = "159--189",
month = aug,
year = "1999",
DOI = "https://doi.org/10.1006/jpdc.1999.1551",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Mar 9 09:19:08 MST 2000",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1551/production;
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Hong Tang and Kai Shen and Tao Yang",
title = "Compile\slash run-time support for threaded {MPI}
execution on multiprogrammed shared memory machines",
journal = j-SIGPLAN,
volume = "34",
number = "8",
pages = "107--118",
month = aug,
year = "1999",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:18:06 MST 2003",
bibsource = "http://www.acm.org/pubs/contents/proceedings/ppopp/301104/;
URL = "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p107-tang/",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Kenjiro Taura and Kunio Tabata and Akinori Yonezawa",
title = "{StackThreads\slash MP}: integrating futures into
calling standards",
journal = j-SIGPLAN,
volume = "34",
number = "8",
pages = "60--71",
month = aug,
year = "1999",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:18:06 MST 2003",
bibsource = "http://www.acm.org/pubs/contents/proceedings/ppopp/301104/;
URL = "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p60-taura/",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Joe Throop",
title = "Standards: {OpenMP}: Shared-Memory Parallelism from
the Ashes",
journal = j-COMPUTER,
volume = "32",
number = "5",
pages = "108--109",
month = may,
year = "1999",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Thu May 6 06:17:23 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://dlib.computer.org/co/books/co1999/pdf/r5108.pdf",
acknowledgement = ack-nhfb,
fjournal = "Computer",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
author = "Marc Torrant and Muhammad Shaaban and Roy Czernikowski
and Ken Hsu",
title = "A simultaneous multithreading simulator",
journal = j-COMP-ARCH-NEWS,
volume = "27",
number = "5",
pages = "1--5",
month = dec,
year = "1999",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "V. Vlassov and A. Kraynikov",
title = "A Queuing Model of a Multi-threaded Architecture: a
Case Study",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1662",
pages = "306--??",
year = "1999",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon Sep 13 16:57:02 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999b.bib;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "B. Weissman and B. Gomes",
title = "High Performance Thread Migration on Clusters of
volume = "2",
number = "2",
pages = "??--??",
month = "????",
year = "1999",
CODEN = "????",
ISSN = "1097-2803",
bibdate = "Fri Dec 19 08:14:13 MST 2003",
bibsource = "http://www.cs.okstate.edu/~pdcp/vols/vol02/vol02no2.html;
URL = "http://www.cs.okstate.edu/~pdcp/vols/vol02/vol02no2abs.html#boris",
acknowledgement = ack-nhfb,
fjournal = "PDCP: Parallel and Distributed Computing Practices",
author = "C.-C. Wu and C. Chen",
title = "Grouping Memory Consistency Model for
Parallel-Multithreaded Shared-Memory Multiprocessor
volume = "10",
number = "1",
pages = "53--82",
month = mar,
year = "1999",
ISSN = "0129-0533",
bibdate = "Mon Feb 25 11:19:21 MST 2002",
bibsource = "http://ejournals.wspc.com.sg/ijhsc/;
OCLC Article1st database",
acknowledgement = ack-nhfb,
fjournal = "International Journal of High Speed Computing",
author = "Zhichen Xu and Barton P. Miller and Oscar Naim",
title = "Dynamic instrumentation of threaded applications",
journal = j-SIGPLAN,
volume = "34",
number = "8",
pages = "49--59",
month = aug,
year = "1999",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:18:06 MST 2003",
bibsource = "http://www.acm.org/pubs/contents/proceedings/ppopp/301104/;
URL = "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p49-xu/",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Haitham Akkary and S{\'e}bastien Hily",
title = "The Case for Speculative Multithreading on {SMT}
Processors",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1940",
pages = "59--??",
year = "2000",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Fri Feb 1 09:17:15 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1940.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1940/19400059.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Anonymous",
title = "New Products: {AVP for Linux/FreeBSD UNIX, Kaspersky
Lab Ltd.; API PowerRAC Chassis 320, Alpha Processor
Inc.; ODBC-ODBC Bridge, Easysoft Ltd.; LinkScan 6.1,
Electronic Software Publishing Corporation; Metro-X
Enhanced Server CD, Metro Link, Inc.; P-STAT
Statistical Software, P-STAT, Inc.; System Manager in a
Box v1.0, PegaSoft Canada; PGI Workstation 3.1, PGI;
Quick Restore 2.6, Workstation Solutions, Inc.;
Threads.h++ and Tools.h++ Professional, Rogue Wave
Software; Scriptics Connect 1.0, 1.1, Scriptics
Corporation; TapeWare 6.2 Backup Software, Yosemite
Technologies, Inc.; DoubleVision for Linux Systems,
Tridia Corporation}",
journal = j-LINUX-J,
volume = "71",
pages = "??--??",
month = mar,
year = "2000",
ISSN = "1075-3583 (print), 1938-3827 (electronic)",
ISSN-L = "1075-3583",
bibdate = "Thu Sep 21 07:44:12 MDT 2000",
bibsource = "http://noframes.linuxjournal.com/lj-issues/issue71/index.html;
acknowledgement = ack-nhfb,
fjournal = "Linux journal",
journal-URL = "http://portal.acm.org/citation.cfm?id=J508",
author = "Anonymous",
title = "Strictly On-Line: {T/TCP: TCP for Transactions by Mark
Stacey, Ivan Griffin and John Nelson; POSIX Thread
Libraries by Felix Garcia and Javier Fernandez; Linux
and Open-Source Applications by Peter Jones and M. B.
Jorgenson; Laptops for Linux! by Jason Kroll}",
journal = j-LINUX-J,
volume = "70",
pages = "??--??",
month = feb,
year = "2000",
ISSN = "1075-3583 (print), 1938-3827 (electronic)",
ISSN-L = "1075-3583",
bibdate = "Thu Sep 21 16:32:31 MDT 2000",
bibsource = "http://noframes.linuxjournal.com/lj-issues/issue70/index.html;
URL = "http://noframes.linuxjournal.com/lj-issues/issue70/3075.html;
acknowledgement = ack-nhfb,
fjournal = "Linux journal",
journal-URL = "http://portal.acm.org/citation.cfm?id=J508",
author = "G. Antoniu and L. Boug{\'e} and R. Namyst and C.
title = "Compiling Data-Parallel Programs to a Distributed
Runtime Environment with Thread Isomigration",
volume = "10",
number = "2/3",
pages = "201--??",
month = sep,
year = "2000",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
bibdate = "Wed Apr 18 07:29:37 2001",
bibsource = "http://ejournals.wspc.com.sg/ppl/10/1002_03/S01296264001002_03.html;
URL = "http://ejournals.wspc.com.sg/ppl/10/1002_03/S0129626400000202.html",
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "Gabriel Antoniu and Luc Boug{\'e} and Philip Hatcher
and Mark MacBeth and Keith McGuigan and Raymond
Namyst",
title = "Implementing {Java} Consistency Using a Generic,
Multithreaded {DSM} Runtime System",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1800",
pages = "560--??",
year = "2000",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Fri Feb 1 09:16:18 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18000560.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Olivier Aumage and Luc Boug{\'e} and Raymond Namyst",
title = "A Portable and Adaptative Multi-protocol Communication
Library for Multithreaded Runtime Systems",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1800",
pages = "1136--??",
year = "2000",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Fri Feb 1 09:16:18 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18001136.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Pete Becker",
title = "The Journeyman's Shop: Unraveling Multithreading",
journal = j-CCCUJ,
volume = "18",
number = "8",
pages = "71--??",
month = aug,
year = "2000",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:27 MDT 2002",
bibsource = "http://www.cuj.com/articles/2000/0008/0008toc.htm?topic=articles;
abstract = "Sometimes you have to spend a lot of time on just a
little bit of code, to avoid spending much more time
not knowing where to begin debugging.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Michael Bedy and Steve Carr and Xianlong Huang and
Ching-Kuang Shene",
title = "A visualization system for multithreaded programming",
journal = j-SIGCSE,
volume = "32",
number = "1",
pages = "1--5",
month = mar,
year = "2000",
DOI = "https://doi.org/10.1145/331795.331798",
ISSN = "0097-8418 (print), 2331-3927 (electronic)",
ISSN-L = "0097-8418",
bibdate = "Mon Nov 19 10:05:03 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "SIGCSE Bulletin (ACM Special Interest Group on
Computer Science Education)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688",
author = "Emery D. Berger and Kathryn S. McKinley and Robert D.
Blumofe and Paul R. Wilson",
title = "{Hoard}: a scalable memory allocator for multithreaded
applications",
journal = j-COMP-ARCH-NEWS,
volume = "28",
number = "5",
pages = "117--128",
month = dec,
year = "2000",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Emery D. Berger and Kathryn S. McKinley and Robert D.
Blumofe and Paul R. Wilson",
title = "{Hoard}: a Scalable Memory Allocator for
Multithreaded Applications",
journal = j-SIGPLAN,
volume = "35",
number = "11",
pages = "117--128",
month = nov,
year = "2000",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:18:19 MST 2003",
bibsource = "http://foothill.lcs.mit.edu/asplos2k/program.html;
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Emery D. Berger and Kathryn S. McKinley and Robert D.
Blumofe and Paul R. Wilson",
title = "{Hoard}: a scalable memory allocator for multithreaded
applications",
journal = j-OPER-SYS-REV,
volume = "34",
number = "5",
pages = "117--128",
month = dec,
year = "2000",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:56 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
author = "K. E. Beyls and E. H. D'Hollander",
title = "Compiler Generated Multithreading to Alleviate Memory
Latency",
journal = j-J-UCS,
volume = "6",
number = "10",
pages = "968--993",
day = "28",
month = oct,
year = "2000",
CODEN = "????",
ISSN = "0948-695X (print), 0948-6968 (electronic)",
ISSN-L = "0948-6968",
bibdate = "Wed Feb 20 07:23:07 MST 2002",
bibsource = "http://www.jucs.org/jucs;
URL = "http://www.jucs.org/jucs_6_10/compiler_generated_multithreading_to",
acknowledgement = ack-nhfb,
fjournal = "J.UCS: Journal of Universal Computer Science",
journal-URL = "http://www.jucs.org/jucs",
author = "Suchendra M. Bhandarkar and Shankar R.
Chandrasekaran",
title = "Parallel Parsing of {MPEG} Video in a Multi-threaded
Multiprocessor Environment",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1800",
pages = "194--??",
year = "2000",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Fri Feb 1 09:16:18 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18000194.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Barry Bolding and Kim Baldridge",
title = "Multithreaded shared memory parallel implementation of
the electronic structure code {GAMESS}",
journal = j-COMP-PHYS-COMM,
volume = "128",
number = "1--2",
pages = "55--66",
day = "9",
month = jun,
year = "2000",
DOI = "https://doi.org/10.1016/S0010-4655(00)00067-9",
ISSN = "0010-4655 (print), 1879-2944 (electronic)",
ISSN-L = "0010-4655",
bibdate = "Mon Feb 13 23:40:43 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0010465500000679",
acknowledgement = ack-nhfb,
fjournal = "Computer Physics Communications",
journal-URL = "http://www.sciencedirect.com/science/journal/00104655",
author = "J. M. Borkenhagen and R. J. Eickemeyer and R. N. Kalla
and S. R. Kunkel",
title = "A multithreaded {PowerPC} processor for commercial
journal = j-IBM-JRD,
volume = "44",
number = "6",
pages = "885--898",
month = nov,
year = "2000",
ISSN = "0018-8646 (print), 2151-8556 (electronic)",
ISSN-L = "0018-8646",
bibdate = "Sat Feb 24 09:44:45 MST 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.research.ibm.com/journal/rd/446/borkenhagen.html",
acknowledgement = ack-nhfb,
fjournal = "IBM Journal of Research and Development",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
ordernumber = "G322-0224",
author = "Fr{\'e}d{\'e}ric Boussinot and Jean-Ferdy Susini",
title = "{Java} threads and {SugarCubes}",
journal = j-SPE,
volume = "30",
number = "5",
pages = "545--566",
day = "25",
month = apr,
year = "2000",
DOI = "https://doi.org/10.1002/(SICI)1097-024X(20000425)30:5<545::AID-SPE308>3.0.CO;2-Q",
ISSN = "0038-0644 (print), 1097-024X (electronic)",
ISSN-L = "0038-0644",
bibdate = "Tue Mar 13 06:45:44 2001",
bibsource = "http://www.interscience.wiley.com/jpages/0038-0644;
URL = "http://www3.interscience.wiley.com/cgi-bin/abstract/71004433/START;
acknowledgement = ack-nhfb,
fjournal = "Software---Practice and Experience",
journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
author = "Steve W. Bova and Clay P. Breshears and Christine E.
Cuicchi and Zeki Demirbilek and Henry A. Gabb",
title = "Dual-Level Parallel Analysis of Harbor Wave Response
Using {MPI} and {OpenMP}",
journal = j-IJHPCA,
volume = "14",
number = "1",
pages = "49--64",
month = "Spring",
year = "2000",
ISSN = "1094-3420 (print), 1741-2846 (electronic)",
ISSN-L = "1094-3420",
bibdate = "Tue Sep 12 12:39:11 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
journal-URL = "http://hpc.sagepub.com/content/by/year",
author = "Margaret Cahir and Robert Moench and Alice E.
title = "Programming Models and Methods",
crossref = "Koniges:2000:ISP",
chapter = "3",
pages = "27--54",
year = "2000",
bibdate = "Fri Feb 04 18:32:51 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Discusses PVM, MPI, SHMEM, High-Performance Fortran,
and POSIX threads.",
acknowledgement = ack-nhfb,
author = "Brendon Cahoon and Kathryn S. McKinley and Zhihong
title = "Evaluating the performance of distributed
architectures for information retrieval using a variety
of workloads",
journal = j-TOIS,
volume = "18",
number = "1",
pages = "1--43",
month = jan,
year = "2000",
ISSN = "1046-8188 (print), 1558-2868 (electronic)",
ISSN-L = "1046-8188",
bibdate = "Tue Sep 26 09:34:01 MDT 2000",
bibsource = "http://www.acm.org/pubs/contents/journals/tois/;
URL = "http://www.acm.org/pubs/citations/journals/tois/2000-18-1/p1-cahoon/",
abstract = "The information explosion across the Internet and
elsewhere offers access to an increasing number of
document collections. In order for users to effectively
access these collections, information retrieval (IR)
systems must provide coordinated, concurrent, and
distributed access. In this article, we explore how to
achieve scalable performance in a distributed system
for collection sizes ranging from 1GB to 128GB. We
implement a fully functional distributed IR system
based on a multithreaded version of the Inquery
simulation model. We measure performance as a function
of system parameters such as client command rate,
number of document collections, terms per query, query
term frequency, number of answers returned, and command
mixture. Our results show that it is important to model
both query and document commands because the
heterogeneity of commands significantly impacts
performance. Based on our results, we recommend simple
changes to the prototype and evaluate the changes using
the simulator. Because of the significant resource
demands of information retrieval, it is not difficult
to generate workloads that overwhelm system resources
regardless of the architecture. However under some
realistic workloads, we demonstrate system
organizations for which response time gracefully
degrades as the workload increases and performance
scales with the number of processors. This scalable
architecture includes a surprisingly small number of
brokers through which a large number of clients and
servers communicate.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Information Systems",
keywords = "distributed information retrieval architectures",
subject = "Computer Systems Organization ---
Computer-Communication Networks --- Distributed Systems
(C.2.4); Computer Systems Organization --- Performance
of Systems (C.4); Computer Systems Organization ---
Performance of Systems (C.4): {\bf Performance
attributes}; Information Systems --- Information
Storage and Retrieval --- Systems and Software
author = "Charles Calkins",
title = "Integrating Threads with Template Classes",
journal = j-CCCUJ,
volume = "18",
number = "5",
pages = "32--??",
month = may,
year = "2000",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:26 MDT 2002",
bibsource = "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles;
abstract = "It's obviously a good idea to encapsulate a thread as
an object. It is less obvious how to get all the
interfaces right.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Steve Carr and Ching-Kuang Shene",
title = "A portable class library for teaching multithreaded
journal = j-SIGCSE,
volume = "32",
number = "3",
pages = "124--127",
month = sep,
year = "2000",
DOI = "https://doi.org/10.1145/353519.343138",
ISSN = "0097-8418 (print), 2331-3927 (electronic)",
ISSN-L = "0097-8418",
bibdate = "Sat Nov 17 16:56:43 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "SIGCSE Bulletin (ACM Special Interest Group on
Computer Science Education)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688",
author = "J. {Chassin de Kergommeaux} and B. Stein and P. E.
title = "{Paj{\'e}}, an interactive visualization tool for
tuning multi-threaded parallel applications",
volume = "26",
number = "10",
pages = "1253--1274",
day = "15",
month = aug,
year = "2000",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Sat Oct 28 17:44:14 MDT 2000",
bibsource = "http://www.elsevier.com/locate/issn/01678191;
URL = "http://www.elsevier.nl/gej-ng/10/35/21/42/31/24/abstract.html;
acknowledgement = ack-nhfb,
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
author = "Sung-Eun Choi and E. Christopher Lewis",
title = "A study of common pitfalls in simple multi-threaded
journal = j-SIGCSE,
volume = "32",
number = "1",
pages = "325--329",
month = mar,
year = "2000",
DOI = "https://doi.org/10.1145/331795.331879",
ISSN = "0097-8418 (print), 2331-3927 (electronic)",
ISSN-L = "0097-8418",
bibdate = "Mon Nov 19 10:05:03 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "It is generally acknowledged that developing correct
multi-threaded codes is difficult, because threads may
interact with each other in unpredictable ways. The
goal of this work is to discover common multi-threaded
programming pitfalls, the knowledge of which will be
useful in instructing new programmers and in developing
tools to aid in multi-threaded programming. To this
end, we study multi-threaded applications written by
students from introductory operating systems courses.
Although the applications are simple, careful
inspection and the use of an automatic race detection
tool reveal a surprising quantity and variety of
synchronization errors. We describe and discuss these
errors, evaluate the role of automated tools, and
propose new tools for use in the instruction of
multi-threaded programming.",
acknowledgement = ack-nhfb,
fjournal = "SIGCSE Bulletin (ACM Special Interest Group on
Computer Science Education)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688",
author = "Thomas Christopher and George Thiruvathukal",
title = "High Performance {Java} Platform Computing:
Multithreaded and Networked Programming",
publisher = pub-PH,
address = pub-PH:adr,
pages = "xxii + 409",
year = "2000",
ISBN = "0-13-016164-0",
ISBN-13 = "978-0-13-016164-2",
LCCN = "????",
bibdate = "Tue Feb 20 18:03:50 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
price = "US\$49.99",
URL = "http://www.sun.com/books/catalog/christopher/",
acknowledgement = ack-nhfb,
author = "James C. Corbett",
title = "Using shape analysis to reduce finite-state models of
concurrent {Java} programs",
journal = j-TOSEM,
volume = "9",
number = "1",
pages = "51--93",
month = jan,
year = "2000",
ISSN = "1049-331X (print), 1557-7392 (electronic)",
ISSN-L = "1049-331X",
bibdate = "Fri Apr 20 08:21:35 MDT 2001",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org/pubs/articles/journals/tosem/2000-9-1/p51-corbett/p51-corbett.pdf;
abstract = "Finite-state verification (e.g., model checking)
provides a powerful means to detect concurrency errors,
which are often subtle and difficult to reproduce.
Nevertheless, widespread use of this technology by
developers is unlikely until tools provide automated
support for extracting the required finite-state models
directly from program source. Unfortunately, the
dynamic features of modern languages such as Java
complicate the construction of compact finite-state
models for verification. In this article, we show how
shape analysis, which has traditionally been used for
computing alias information in optimizers, can be used
to greatly reduce the size of finite-state models of
concurrent Java programs by determining which
heap-allocated variables are accessible only by a
single thread, and which shared variables are protected
by locks. We also provide several other state-space
reductions based on the semantics of Java monitors. A
prototype of the reductions demonstrates their
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Software Engineering and
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J790",
keywords = "concurrent systems; finite-state verification; Java;
model extraction; modeling; shape analysis; state-space
subject = "Software --- Software Engineering --- Software/Program
Verification (D.2.4)",
author = "J. Cui and J. L. Bordim and K. Nakano and T. Hayashi
and N. Ishii",
title = "Multithreaded Parallel Computer Model with Performance
journal = j-LECT-NOTES-COMP-SCI,
volume = "1800",
pages = "155--??",
year = "2000",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Fri Feb 1 09:16:18 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18000155.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Vincent Danjean and Raymond Namyst and Robert D.
title = "Integrating Kernel Activations in a Multithreaded
Runtime System on Top of {Linux}",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1800",
pages = "1160--??",
year = "2000",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Fri Feb 1 09:16:18 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18001160.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "David Dill",
title = "Model checking {Java} programs (abstract only)",
journal = j-SIGSOFT,
volume = "25",
number = "5",
pages = "179",
month = sep,
year = "2000",
DOI = "https://doi.org/10.1145/347636.349113",
ISSN = "0163-5948 (print), 1943-5843 (electronic)",
ISSN-L = "0163-5948",
bibdate = "Wed Aug 1 17:14:00 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
abstract = "Automatic state exploration tools (model checkers)
have had some success when applied to protocols and
hardware designs, but there are fewer success stories
about software. This is unfortunate, since the software
problem is worsening even faster than the hardware and
protocol problems. Model checking of concurrent
programs is especially interesting, because they are
notoriously difficult to test, analyze, and debug by
other methods. This talk will be a description of our
initial efforts to check Java programs using a model
checker. The model checker supports dynamic allocation,
thread creation, and recursive procedures (features
that are not necessary for hardware verification), and
has some special optimizations and checks tailored to
multi-threaded Java program. I will also discuss some
of the challenges for future efforts in this area.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGSOFT Software Engineering Notes",
journal-URL = "https://dl.acm.org/citation.cfm?id=J728",
author = "Kenneth J. Duda and David R. Cheriton",
title = "Borrowed-virtual-time {(BVT)} scheduling: supporting
latency-sensitive threads in a general-purpose
journal = j-OPER-SYS-REV,
volume = "34",
number = "2",
pages = "27--28",
month = apr,
year = "2000",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:42 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
author = "Ralf S. Engelschall",
title = "Portable Multithreading --- The Signal Stack Trick for
User-Space Thread Creation",
crossref = "USENIX:2000:UAT",
pages = "239--249",
year = "2000",
bibdate = "Tue Oct 15 09:53:32 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.usenix.org/events/usenix2000/general/engelschall.html",
acknowledgement = ack-nhfb,
author = "Kriszti{\'a}n Flautner and Rich Uhlig and Steve
Reinhardt and Trevor Mudge",
title = "Thread-level parallelism and interactive performance
of desktop applications",
journal = j-COMP-ARCH-NEWS,
volume = "28",
number = "5",
pages = "129--138",
month = dec,
year = "2000",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Kriszti{\'a}n Flautner and Rich Uhlig and Steve
Reinhardt and Trevor Mudge",
title = "Thread Level Parallelism and Interactive Performance
of Desktop Applications",
journal = j-SIGPLAN,
volume = "35",
number = "11",
pages = "129--138",
month = nov,
year = "2000",
DOI = "https://doi.org/10.1145/356989.357001",
ISBN = "1-58113-317-0",
ISBN-13 = "978-1-58113-317-2",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:18:19 MST 2003",
bibsource = "http://foothill.lcs.mit.edu/asplos2k/program.html;
URL = "http://delivery.acm.org/10.1145/360000/357001/p129-flautner.pdf",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "IA-64",
author = "Kriszti{\'a}n Flautner and Rich Uhlig and Steve
Reinhardt and Trevor Mudge",
title = "Thread-level parallelism and interactive performance
of desktop applications",
journal = j-OPER-SYS-REV,
volume = "34",
number = "5",
pages = "129--138",
month = dec,
year = "2000",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:56 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
author = "Felix Garcia and Javier Fernandez",
title = "{POSIX} Thread Libraries",
journal = j-LINUX-J,
volume = "70",
pages = "??--??",
month = feb,
year = "2000",
ISSN = "1075-3583 (print), 1938-3827 (electronic)",
ISSN-L = "1075-3583",
bibdate = "Thu Sep 21 16:46:44 MDT 2000",
bibsource = "http://noframes.linuxjournal.com/lj-issues/issue70/index.html;
URL = "http://noframes.linuxjournal.com/lj-issues/issue/3184.html",
acknowledgement = ack-nhfb,
fjournal = "Linux Journal",
journal-URL = "http://portal.acm.org/citation.cfm?id=J508",
author = "L. Geppert",
title = "Microprocessors: the off-beat generation",
journal = j-IEEE-SPECTRUM,
volume = "37",
number = "7",
pages = "44--49",
month = jul,
year = "2000",
DOI = "https://doi.org/10.1109/6.852051",
ISSN = "0018-9235 (print), 1939-9340 (electronic)",
ISSN-L = "0018-9235",
bibdate = "Sat Jan 18 12:29:46 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeespectrum2000.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Spectrum",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=6",
keywords = "Biology computing; Bonding; Broadband communication;
broadband networks; Electronics industry;
microprocessor chips; microprocessors; Microprocessors;
multimedia broadband communications; multimedia
communication; multimedia computing; Multithreading;
off-beat generation; performance; Personal
communication networks; programmable controllers;
programmable logic; Real time systems; Supercomputers;
supercomputing; Workstations",
author = "Alex Gontmakher and Assaf Schuster",
title = "{Java} consistency: nonoperational characterizations
for {Java} memory behavior",
journal = j-TOCS,
volume = "18",
number = "4",
pages = "333--386",
year = "2000",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Wed Jul 18 10:18:45 MDT 2001",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org/pubs/articles/journals/tocs/2000-18-4/p333-gontmakher/p333-gontmakher.pdf;
abstract = "The Java Language Specification (JLS) [Gosling et al.
1996] provides an operational definition for the
consistency of shared variables. The definition remains
unchanged in the JLS 2nd edition, currently under peer
review, which relies on a specific abstract machine as
its underlying model, is very complicated. Several
subsequent works have tried to simplify and formalize
it. However, these revised definitions are also
operational, and thus have failed to highlight the
intuition behind the original specification. In this
work we provide a complete nonoperational specification
for Java and for the JVM, excluding synchronized
operations. We provide a simpler definition, in which
we clearly distinguish the consistency model that is
promised to the programmer from that which should be
implemented in the JVM. This distinction, which was
implicit in the original definition, is crucial for
building the JVM. We find that the programmer model is
strictly weaker than that of the JVM, and precisely
define their discrepancy. Moreover, our definition is
independent of any specific (or even abstract) machine,
and can thus be used to verify JVM implementations and
compiler optimizations on any platform. Finally, we
show the precise range of consistency relaxations
obtainable for the Java memory model when a certain
compiler optimization---called {\em prescient stores\/}
in JLS---is applicable.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer Systems",
generalterms = "Verification",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
keywords = "Java memory models; multithreading; nonoperational
subject = "Hardware --- Memory Structures --- Performance
Analysis and Design Aids** (B.3.3): {\bf Formal
author = "K. Gopinath and M. K. Krishna Narasimhan",
title = "Performance of Switch Blocking on Multithreaded
journal = j-J-UCS,
volume = "6",
number = "10",
pages = "928--947",
day = "28",
month = oct,
year = "2000",
CODEN = "????",
ISSN = "0948-695X (print), 0948-6968 (electronic)",
ISSN-L = "0948-6968",
bibdate = "Wed Feb 20 07:23:07 MST 2002",
bibsource = "http://www.jucs.org/jucs;
URL = "http://www.jucs.org/jucs_6_10/performance_of_switch_blocking",
acknowledgement = ack-nhfb,
fjournal = "J.UCS: Journal of Universal Computer Science",
journal-URL = "http://www.jucs.org/jucs",
author = "Allen I. Holub",
title = "Taming {Java} Threads",
publisher = pub-APRESS,
address = pub-APRESS:adr,
pages = "x + 300",
year = "2000",
ISBN = "1-893115-10-0",
ISBN-13 = "978-1-893115-10-1",
LCCN = "QA76.73.J38 H635 2000",
bibdate = "Fri May 10 12:18:17 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
price = "US\$34.95",
acknowledgement = ack-nhfb,
keywords = "Java (computer program language); threads (computer
author = "Peter Horwood and Shlomo Wygodny and Martin Zardecki",
title = "Debugging Multithreaded Applications",
journal = j-DDJ,
volume = "25",
number = "3",
pages = "32, 34--37",
month = mar,
year = "2000",
ISSN = "1044-789X",
bibdate = "Thu Nov 9 08:25:14 MST 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.ddj.com/ftp/2000/2000_03/dbgmulti.txt",
abstract = "It is often significantly harder to locate and test
for bugs in multithreaded and multiprocess applications
than for nonthreaded, single process situations. Our
authors describe some of the problems with
multithreaded applications and discuss common debugging
techniques. Additional resources include dbgmulti.txt
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "David M. Howard",
title = "Using Predicate Waits with {Win32} Threads",
journal = j-CCCUJ,
volume = "18",
number = "5",
pages = "18--??",
month = may,
year = "2000",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:26 MDT 2002",
bibsource = "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles;
abstract = "Most Win32 synchronization primitives are just that
--- primitive. But you can use them to build queues
that are safe and easy to use.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Paul Hyde",
title = "{Java} thread programming",
publisher = pub-SAMS,
address = pub-SAMS:adr,
pages = "iv + 510",
year = "2000",
ISBN = "0-672-31585-8",
ISBN-13 = "978-0-672-31585-5",
LCCN = "QA76.73.J38 H93 1999",
bibdate = "Wed Feb 21 06:02:14 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "J. Keller and T. Ungerer",
title = "{J.UCS} Special Issue on Multithreaded Processors and
journal = j-J-UCS,
volume = "6",
number = "10",
pages = "906--907",
day = "28",
month = oct,
year = "2000",
CODEN = "????",
ISSN = "0948-695X (print), 0948-6968 (electronic)",
ISSN-L = "0948-6968",
bibdate = "Wed Feb 20 07:23:07 MST 2002",
bibsource = "http://www.jucs.org/jucs;
URL = "http://www.jucs.org/jucs_6_10/j_ucs_special_issue",
acknowledgement = ack-nhfb,
fjournal = "J.UCS: Journal of Universal Computer Science",
journal-URL = "http://www.jucs.org/jucs",
author = "Jeff Kleber",
title = "Thread-Safe Access to Collections",
journal = j-CCCUJ,
volume = "18",
number = "5",
pages = "36--??",
month = may,
year = "2000",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:26 MDT 2002",
bibsource = "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles;
abstract = "The best place to store a thread lock for a shared
container is somewhere inside the container --- deep
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "David Lafreniere",
title = "State Machine Design in {C++}",
journal = j-CCCUJ,
volume = "18",
number = "5",
pages = "58--??",
month = may,
year = "2000",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:26 MDT 2002",
bibsource = "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles;
abstract = "It's not all that hard to implement a finite-state
machine, unless it's very large, and you have to worry
about multithreading, and \ldots{}.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Bil Lewis and Daniel J. Berg",
title = "Multithreaded Programming with {Java} Technology",
address = pub-SUN-MICROSYSTEMS-PRESS:adr,
pages = "xxv + 461",
year = "2000",
ISBN = "0-13-017007-0",
ISBN-13 = "978-0-13-017007-1",
LCCN = "QA76.73.J38 L488 2000",
bibdate = "Fri Apr 11 15:58:52 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
price = "US\$39.99",
series = "Sun BluePrints Program",
URL = "http://www.sun.com/books/catalog/lewis3/index.html",
acknowledgement = ack-nhfb,
author = "Yibei Ling and Tracy Mullen and Xiaola Lin",
title = "Analysis of optimal thread pool size",
journal = j-OPER-SYS-REV,
volume = "34",
number = "2",
pages = "42--55",
month = apr,
year = "2000",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:42 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
author = "Juval Lowy",
title = "Making Primitive Objects Thread Safe",
journal = j-CCCUJ,
volume = "18",
number = "3",
pages = "85--??",
month = mar,
year = "2000",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:25 MDT 2002",
bibsource = "http://www.cuj.com/articles/2000/0003/0003toc.htm?topic=articles;
abstract = "All sorts of things need thread locks. A fairly simple
template or two can do the job.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Satoshi Matsushita and Sunao Torii and Masahiko Nomura
and Toshiaki Inoue and Atsufumi Shibayama and Sachiko
Shimada and Taku Osawa and Hiroaki Inoue and Kouichiro
Minami and Junji Sakai and Yoshiyuki Ito and Yuichi
Nakamura and Masato Edahiro and Naoki Nishi and
Masakazu Yamashina",
title = "{Merlot}: a Single-Chip Tightly Coupled Four-Way
Multi-Thread Processor",
crossref = "Anonymous:2000:CCI",
pages = "??--??",
year = "2000",
bibdate = "Mon Jan 08 05:28:04 2001",
bibsource = "http://www.coolchips.org/index-cool3.html;
abstract = "We developed an on-chip four-way multiprocessor, MP98
version 1, code-named Merlot. It is fabricated with a
0.15 $ \mu $ m process and has a die size of 110 mm$^2$.
Merlot is a high performance embedded processor for
intelligent appliances. We extract a higher degree of
parallelism with low voltage operation. In our
presentation, we describe our multi-threading model.
Then, we explain Merlot's pipeline architecture,
focusing on fast thread creation and memory renaming.
We also describe our on-chip SDRAM interface which has
a throughput greater than 1 GB/sec and cache miss
penalty less than 100 ns. Finally, we show a
performance estimation for speech recognition and MPEG2
code, power dissipation, and average memory latency.
Restructured speech recognition code was compiled with
directives, and IPC of 2.72 is estimated.",
acknowledgement = ack-nhfb,
author = "A. Metzner and J. Niehaus",
title = "{MSparc}: Multithreading in Real-Time Architectures",
journal = j-J-UCS,
volume = "6",
number = "10",
pages = "1034--1051",
day = "28",
month = oct,
year = "2000",
CODEN = "????",
ISSN = "0948-695X (print), 0948-6968 (electronic)",
ISSN-L = "0948-6968",
bibdate = "Wed Feb 20 07:23:07 MST 2002",
bibsource = "http://www.jucs.org/jucs;
URL = "http://www.jucs.org/jucs_6_10/msparc_multithreading_in_real",
acknowledgement = ack-nhfb,
fjournal = "J.UCS: Journal of Universal Computer Science",
journal-URL = "http://www.jucs.org/jucs",
author = "A. S. Mohamed and A. Galal and I. Khalil and K. Sobh
and M. Selim",
title = "{Dispo}: Distributed Multi-Threaded Execution of
{Prolog} Programs",
journal = j-INT-J-COMPUT-APPL,
volume = "22",
number = "2",
pages = "100--108",
year = "2000",
DOI = "https://doi.org/10.1080/1206212X.2000.11441606",
ISSN = "1206-212X (print), 1925-7074 (electronic)",
ISSN-L = "1206-212X",
bibdate = "Sat Apr 21 17:19:15 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ijca.bib;
URL = "https://www.tandfonline.com/doi/full/10.1080/1206212X.2000.11441606",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Computers and Applications",
journal-URL = "https://www.tandfonline.com/loi/tjca20",
online-date = "10 Jul 2015",
author = "John Mount",
title = "Automatic Detection Of Potential Deadlock",
journal = j-DDJ,
volume = "25",
number = "12",
pages = "64, 66--70, 72",
month = dec,
year = "2000",
ISSN = "1044-789X",
bibdate = "Wed Nov 8 15:09:25 MST 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.ddj.com/ftp/2000/2000_12/deadlock.txt;
abstract = "Deadlock can occur when a number of consumers
(typically threads) access a set of resources in an
unacceptable pattern. To combat it, John presents a
solution based on run-time lock analysis that analyzes
all transactions. Additional resources include
deadlock.txt (listings) and deadlock.zip (source
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Zsolt N{\'e}meth",
title = "Abstract machine design on a multithreaded
journal = j-FUT-GEN-COMP-SYS,
volume = "16",
number = "6",
pages = "705--716",
month = apr,
year = "2000",
ISSN = "0167-739X (print), 1872-7115 (electronic)",
ISSN-L = "0167-739X",
bibdate = "Wed Feb 27 12:41:20 MST 2002",
bibsource = "http://www.elsevier.com/locate/issn/0167739X;
URL = "http://www.elsevier.com/gej-ng/10/19/19/41/29/36/abstract.html",
acknowledgement = ack-nhfb,
fjournal = "Future Generation Computer Systems",
journal-URL = "http://www.sciencedirect.com/science/journal/0167739X",
author = "Ida M. B. Nielsen and Curtis L. Janssen",
title = "Multi-threading: a new dimension to massively parallel
scientific computation",
journal = j-COMP-PHYS-COMM,
volume = "128",
number = "1--2",
pages = "238--244",
day = "9",
month = jun,
year = "2000",
DOI = "https://doi.org/10.1016/S0010-4655(00)00062-X",
ISSN = "0010-4655 (print), 1879-2944 (electronic)",
ISSN-L = "0010-4655",
bibdate = "Mon Feb 13 23:40:43 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S001046550000062X",
acknowledgement = ack-nhfb,
fjournal = "Computer Physics Communications",
journal-URL = "http://www.sciencedirect.com/science/journal/00104655",
author = "Yoshihiro Oyama and Kenjiro Taura and Akinori
title = "Online Computation of Critical Paths for Multithreaded
journal = j-LECT-NOTES-COMP-SCI,
volume = "1800",
pages = "301--??",
year = "2000",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Fri Feb 1 09:16:18 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18000301.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Mark Peterson",
title = "{C/C++} Tips: Tip \#4: Self Destructing Threads",
journal = j-CCCUJ,
volume = "18",
number = "12",
pages = "44--??",
month = dec,
year = "2000",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:29 MDT 2002",
bibsource = "http://www.cuj.com/articles/2000/0012/0012toc.htm?topic=articles;
abstract = "A way to make threads easier to manage.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Ivan Pulleyn",
title = "Embedding {Python} in Multi-Threaded {C\slash C++}
journal = j-LINUX-J,
volume = "73",
pages = "??--??",
month = may,
year = "2000",
ISSN = "1075-3583 (print), 1938-3827 (electronic)",
ISSN-L = "1075-3583",
bibdate = "Thu Sep 21 07:44:12 MDT 2000",
bibsource = "http://noframes.linuxjournal.com/lj-issues/issue73/index.html;
abstract = "Python provides a clean intuitive interface to
complex, threaded applications.",
acknowledgement = ack-nhfb,
fjournal = "Linux journal",
journal-URL = "http://portal.acm.org/citation.cfm?id=J508",
author = "M. Ranganathan and Mark Bednarek and Fernand Pors and
Doug Montgomery",
title = "{AGNI}: a Multi-threaded Middleware for Distributed
crossref = "USENIX:2000:PUT",
pages = "??--??",
year = "2000",
bibdate = "Wed Oct 16 05:17:16 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://db.usenix.org/publications/library/proceedings/tcl2k/ranganathan.html",
acknowledgement = ack-nhfb,
author = "Joshua A. Redstone and Susan J. Eggers and Henry M.
title = "An analysis of operating system behavior on a
simultaneous multithreaded architecture",
journal = j-COMP-ARCH-NEWS,
volume = "28",
number = "5",
pages = "245--256",
month = dec,
year = "2000",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Joshua A. Redstone and Susan J. Eggers and Henry M.
title = "An Analysis of Operating System Behavior on a
Simultaneous Multithreaded Architecture",
journal = j-SIGPLAN,
volume = "35",
number = "11",
pages = "245--256",
month = nov,
year = "2000",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:18:19 MST 2003",
bibsource = "http://foothill.lcs.mit.edu/asplos2k/program.html;
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Joshua A. Redstone and Susan J. Eggers and Henry M.
title = "An analysis of operating system behavior on a
simultaneous multithreaded architecture",
journal = j-OPER-SYS-REV,
volume = "34",
number = "5",
pages = "245--256",
month = dec,
year = "2000",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:56 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
author = "Steven K. Reinhardt and Shubhendu S. Mukherjee",
title = "Transient fault detection via simultaneous
journal = j-COMP-ARCH-NEWS,
volume = "28",
number = "2",
pages = "25--36",
month = may,
year = "2000",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:49 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Steven H. Samorodin and Raju Pandey",
title = "Supporting Flexible Safety and Sharing in
Multi-threaded Environments",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1800",
pages = "1184--??",
year = "2000",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Fri Feb 1 09:16:18 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18001184.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Yasushi Shinjo and Calton Pu",
title = "Developing correct and efficient multithreaded
programs with thread-specific data and a partial
journal = j-OPER-SYS-REV,
volume = "34",
number = "2",
pages = "33--33",
month = apr,
year = "2000",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:42 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Yasushi Shinjo",
title = "Developing correct and efficient multithreaded
programs with thread-specific data and a partial
journal = j-OPER-SYS-REV,
volume = "34",
number = "2",
pages = "40--40",
month = apr,
year = "2000",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:42 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Allan Snavely and Dean M. Tullsen",
title = "Symbiotic job scheduling for a simultaneous
multithreaded processor",
journal = j-COMP-ARCH-NEWS,
volume = "28",
number = "5",
pages = "234--244",
month = dec,
year = "2000",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Allan Snavely and Dean M. Tullsen",
title = "Symbiotic Jobscheduling for a Simultaneous
Multithreading Processor",
journal = j-SIGPLAN,
volume = "35",
number = "11",
pages = "234--244",
month = nov,
year = "2000",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:18:19 MST 2003",
bibsource = "http://foothill.lcs.mit.edu/asplos2k/program.html;
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Allan Snavely and Dean M. Tullsen",
title = "Symbiotic jobscheduling for a simultaneous
multithreaded processor",
journal = j-OPER-SYS-REV,
volume = "34",
number = "5",
pages = "234--244",
month = dec,
year = "2000",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:56 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
author = "J. Greggory Steffan and Christopher B. Colohan and
Antonia Zhai and Todd C. Mowry",
title = "A scalable approach to thread-level speculation",
journal = j-COMP-ARCH-NEWS,
volume = "28",
number = "2",
pages = "1--12",
month = may,
year = "2000",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:49 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Kian-Lee Tan and Cheng Hian Goh and Beng Chin Ooi",
title = "Progressive evaluation of nested aggregate queries",
journal = j-VLDB-J,
volume = "9",
number = "3",
pages = "261--278",
month = dec,
year = "2000",
ISSN = "1066-8888 (print), 0949-877X (electronic)",
ISSN-L = "1066-8888",
bibdate = "Mon Jun 23 10:50:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In many decision-making scenarios, decision makers
require rapid feedback to their queries, which
typically involve aggregates. The traditional {\em
blocking execution model\/} can no longer meet the
demands of these users. One promising approach in the
literature, called {\em online aggregation}, evaluates
an aggregation query progressively as follows: as soon
as certain data have been evaluated, approximate
answers are produced with their respective running
confidence intervals; as more data are examined, the
answers and their corresponding running confidence
intervals are refined. In this paper, we extend this
approach to handle nested queries with aggregates
(i.e., at least one inner query block is an aggregate
query) by providing users with (approximate) answers
progressively as the inner aggregation query blocks are
evaluated. We address the new issues posed by nested
queries. In particular, the answer space begins with a
superset of the final answers and is refined as the
aggregates from the inner query blocks are refined. For
the intermediary answers to be meaningful, they have to
be interpreted with the aggregates from the inner
queries. We also propose a {\em multi-threaded model\/}
in evaluating such queries: each query block is
assigned to a thread, and the threads can be evaluated
concurrently and independently. The time slice across
the threads is {\em nondeterministic\/} in the sense
that the user controls the relative rate at which these
subqueries are being evaluated. For {\em enumerative\/}
nested queries, we propose a priority-based evaluation
strategy to present answers that are certainly in the
final answer space first, before presenting those whose
validity may be affected as the inner query aggregates
are refined. We implemented a prototype system using
Java and evaluated our system. Results for nested
queries with a level and multiple levels of nesting are
reported. Our results show the effectiveness of the
proposed mechanisms in providing progressive feedback
that reduces the initial waiting time of users
significantly without sacrificing the quality of the
acknowledgement = ack-nhfb,
fjournal = "VLDB Journal: Very Large Data Bases",
journal-URL = "http://portal.acm.org/toc.cfm?id=J869",
keywords = "approximate answers; multi-threading; nested aggregate
queries; online aggregation; progressive query
author = "Hong Tang and Kai Shen and Tao Yang",
title = "Program transformation and runtime support for
threaded {MPI} execution on shared-memory machines",
journal = j-TOPLAS,
volume = "22",
number = "4",
pages = "673--700",
year = "2000",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Tue Apr 17 10:05:24 MDT 2001",
bibsource = "http://www.acm.org/pubs/toc/;
URL = "http://www.acm.org/pubs/citations/journals/toplas/2000-22-4/p673-tang/",
abstract = "Parallel programs written in MPI have been widely used
for developing high-performance applications on various
platforms. Because of a restriction of the MPI
computation model, conventional MPI implementations on
shared-memory machines map each MPI node to an OS
process, which can suffer serious performance
degradation in the presence of multiprogramming. This
paper studies compile-time and runtime techniques for
enhancing performance portability of MPI code running
on multiprogrammed shared-memory machines. The proposed
techniques allow MPI nodes to be executed safely and
efficiently as threads. Compile-time transformation
eliminates global and static variables in C code using
node-specific data. The runtime support includes an
efficient and provably correct communication protocol
that uses lock-free data structure and takes advantage
of address space sharing among threads. The experiments
on SGI Origin 2000 show that our MPI prototype called
TMPI using the proposed techniques is competitive with
SGI's native MPI implementation in a dedicated
environment, and that it has significant performance
advantages in a multiprogrammed environment.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Programming Languages and
generalterms = "Algorithms; Design; Experimentation; Languages;
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
keywords = "lock-free synchronization; MPI; multiprogrammed
environments; program transformation; shared-memory
machines; threaded execution",
subject = "Hardware --- Memory Structures --- Design Styles
(B.3.2): {\bf Shared memory}; Software --- Programming
Techniques --- Concurrent Programming (D.1.3): {\bf
Parallel programming}; Software --- Programming
Languages --- Language Classifications (D.3.2): {\bf
Concurrent, distributed, and parallel languages};
Software --- Programming Languages --- Processors
(D.3.4): {\bf Preprocessors}; Software --- Programming
Languages --- Processors (D.3.4): {\bf Run-time
environments}; Software --- Operating Systems ---
Process Management (D.4.1): {\bf
Multiprocessing/multiprogramming/multitasking}; Data
--- Data Structures (E.1): {\bf Lists, stacks, and
author = "Kevin B. Theobald and Gagan Agrawal and Rishi Kumar
and Gerd Heber and Guang R. Gao and Paul Stodghill and
Keshav Pingali",
title = "Landing {CG} on {EARTH}: a Case Study of
Fine-Grained Multithreading on an Evolutionary Path",
crossref = "ACM:2000:SHP",
pages = "47--47",
year = "2000",
bibdate = "Mon Feb 12 11:57:42 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.sc2000.org/proceedings/techpapr/papers/pap293.pdf",
acknowledgement = ack-nhfb,
author = "A. Unger and E. Zehendner and Th. Ungerer",
title = "A combined compiler and architecture technique to
control multithreaded execution of branches and loop
journal = j-COMP-ARCH-NEWS,
volume = "28",
number = "1",
pages = "53--61",
month = mar,
year = "2000",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:36 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Andrej Vckovski and Jason Brazile",
title = "A Multi-Threaded Server for Shared Hash Table Access",
crossref = "USENIX:2000:PUT",
pages = "??--??",
year = "2000",
bibdate = "Wed Oct 16 05:17:16 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
URL = "http://db.usenix.org/publications/library/proceedings/tcl2k/vckovski.html",
acknowledgement = ack-nhfb,
author = "Shlomit Dascal and Uzi Vishkin",
title = "Experiments with list ranking for explicit
multi-threaded {(XMT)} instruction parallelism",
volume = "5",
pages = "10:1--10:??",
month = "????",
year = "2000",
CODEN = "????",
DOI = "https://doi.org/10.1145/351827.384252",
ISSN = "1084-6654",
bibdate = "Mon Oct 6 16:03:09 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Algorithms for the problem of list ranking are
empirically studied with respect to the Explicit
Multi-Threaded (XMT) platform for instruction-level
parallelism (ILP). The main goal of this study is to
understand the differences between XMT and more
traditional parallel computing implementation
platforms/models as they pertain to the well studied
list ranking problem. The main two findings are: (i)
good speedups for much smaller inputs are possible and
(ii) in part, the first finding is based on a new
variant of a 1984 algorithm, called the No-Cut
algorithm. The paper incorporates analytic
(non-asymptotic) performance analysis into experimental
performance analysis for relatively small inputs. This
provides an interesting example where experimental
research and theoretical analysis complement one
another. Explicit Multi-Threading (XMT) is a
fine-grained computation framework introduced in our
SPAA'98 paper. Building on some key ideas of parallel
computing, XMT covers the spectrum from algorithms
through architecture to implementation; the main
implementation related innovation in XMT was through
the incorporation of low-overhead hardware and software
mechanisms (for more effective fine-grained
parallelism). The reader is referred to that paper for
detail on these mechanisms. The XMT platform aims at
faster single-task completion time by way of ILP.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "Journal of Experimental Algorithmics (JEA)",
author = "Mark Walmsley",
title = "Multi-threaded programming in {C++}",
publisher = pub-SV,
address = pub-SV:adr,
pages = "x + 223",
year = "2000",
ISBN = "1-85233-146-1",
ISBN-13 = "978-1-85233-146-7",
LCCN = "QA76.73.C153 W3148 2000",
bibdate = "Sat Apr 20 11:14:00 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
price = "US\$49.95",
acknowledgement = ack-nhfb,
author = "Gregory V. Wilson",
title = "Programmer's Bookshelf: Classics Old and New",
journal = j-DDJ,
volume = "25",
number = "11",
pages = "159--160",
month = nov,
year = "2000",
ISSN = "1044-789X",
bibdate = "Wed Nov 8 15:09:25 MST 2000",
bibsource = "http://www.ddj.com/;
abstract = "This month Greg looks at Programming Pearls, Second
Edition, by Jon Bentley; Foundations of Multithreaded,
Parallel, and Distributed Programming, by Gregory R.
Andrews; GUI Bloopers, by Jeff Johnson; The Humane
Interface, by Jef Raskin; Legal Battles That Shaped the
Software Industry, by Lawrence D. Graham; The World of
Scripting Languages, by David Barron; C for Java
Programmers, by Tomasz Muldner; and XML Elements of
Style, by Simon St. Laurent.",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Peter Zhang",
title = "{Webrelay}: a Multithreaded {HTTP} Relay Server",
journal = j-DDJ,
volume = "25",
number = "2",
pages = "86, 88, 90--94, 96",
month = feb,
year = "2000",
ISSN = "1044-789X",
bibdate = "Thu Nov 9 08:25:13 MST 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.ddj.com/ftp/2000/2000_02/webrelay.txt;
abstract = "Webrelay is a freely available multithreaded HTTP
relay server that authenticates that clients are
legitimate users before they are connected to vendor
web servers. Additional resources include webrelay.txt
(listings) and webrelay.zip (source code).",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Anonymous",
title = "Errata: {``Speculative Multithreaded Processors''}",
journal = j-COMPUTER,
volume = "34",
number = "5",
pages = "7--7",
month = may,
year = "2001",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Fri May 4 17:53:39 MDT 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/computer2000.bib;
note = "See \cite{Sohi:2001:SMP}.",
URL = "http://dlib.computer.org/co/books/co2001/pdf/r5004.pdf",
acknowledgement = ack-nhfb,
fjournal = "Computer",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
author = "Gabriel Antoniu and Luc Boug{\'e} and Philip Hatcher
and Mark MacBeth and Keith McGuigan and Raymond
title = "Compiling Multithreaded {Java} Bytecode for
Distributed Execution (Distinguished Paper)",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1900",
pages = "1039--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Feb 2 13:02:44 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1900/19001039.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Gabriel Antoniu and Luc Boug{\'e}",
title = "{DSM-PM2}: a Portable Implementation Platform for
Multithreaded {DSM} Consistency Protocols",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2026",
pages = "55--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Feb 2 13:03:43 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2026.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2026/20260055.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Gabriel Antoniu and others",
title = "The {Hyperion} system: {Compiling} multithreaded
{Java} bytecode for distributed execution",
volume = "27",
number = "10",
pages = "1279--1297",
month = sep,
year = "2001",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Fri Feb 22 16:52:42 MST 2002",
bibsource = "http://www.elsevier.com/locate/issn/01678191;
URL = "http://www.elsevier.com/gej-ng/10/35/21/47/40/27/abstract.html;
acknowledgement = ack-nhfb,
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
author = "Isabelle Attali and Denis Caromel and Marjorie Russo",
title = "Graphical Visualization of {Java} Objects, Threads,
and Locks",
volume = "2",
number = "1",
year = "2001",
ISSN = "1541-4922 (print), 1558-1683 (electronic)",
ISSN-L = "1541-4922",
bibdate = "Wed Oct 23 17:47:56 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://dsonline.computer.org/0101/features/att0101_print.htm",
acknowledgement = ack-nhfb,
fjournal = "IEEE Distributed Systems Online",
author = "Thomas Ball and Sagar Chaki and Sriram K. Rajamani",
title = "Parameterized Verification of Multithreaded Software
journal = j-LECT-NOTES-COMP-SCI,
volume = "2031",
pages = "158--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Feb 2 13:03:48 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2031.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2031/20310158.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Thomas Becker",
title = "Synchronization Monitors For {Win32}",
journal = j-DDJ,
volume = "26",
number = "12",
pages = "46, 48, 50--52, 54",
month = dec,
year = "2001",
ISSN = "1044-789X",
bibdate = "Tue Feb 12 05:21:41 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.ddj.com/ftp/2001/2001_12/monitor.txt;
abstract = "Thomas presents a Java-style synchronization monitor
for multithreaded Win32 development. Additional
resources include {\tt monitor.txt} (listings) and {\tt
monitor.zip} (source code).",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Magnus Broberg and Lars Lundberg and H{\aa}kan Grahn",
title = "Performance Optimization Using Extended Critical Path
Analysis in Multithreaded Programs on Multiprocessors",
journal = j-J-PAR-DIST-COMP,
volume = "61",
number = "1",
pages = "115--136",
day = "1",
month = jan,
year = "2001",
DOI = "https://doi.org/10.1006/jpdc.2000.1667",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Fri Feb 22 15:30:35 MST 2002",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1667;
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Holger Brunst and Wolfgang E. Nagel and Hans-Christian
title = "Group-Based Performance Analysis for Multithreaded
{SMP} Cluster Applications",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2150",
pages = "148--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Feb 2 13:05:53 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2150.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2150/21500148.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "J. Mark Bull and Darragh O'Neill",
title = "A microbenchmark suite for {OpenMP 2.0}",
journal = j-COMP-ARCH-NEWS,
volume = "29",
number = "5",
pages = "41--48",
month = dec,
year = "2001",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Rohit Chandra and Leonardo Dagum and David Kohr and
Dror Maydan and Jeff McDonald and Ramesh Menon",
title = "Parallel Programming in {OpenMP}",
publisher = pub-MORGAN-KAUFMANN,
address = pub-MORGAN-KAUFMANN:adr,
pages = "xvi + 230",
year = "2001",
ISBN = "1-55860-671-8",
ISBN-13 = "978-1-55860-671-5",
LCCN = "QA76.642 .P38 2001",
bibdate = "Thu Jul 14 11:09:17 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
price = "US\$39.95",
URL = "http://www.mkp.com/books_catalog/catalog.asp?ISBN=1-55860-671-8",
abstract = "The rapid and widespread acceptance of shared memory
multiprocessor architectures has created a pressing
demand for an efficient way to program these systems.
At the same time, developers of technical and
scientific applications in industry and in government
laboratories find they need to parallelize huge volumes
of code in a portable fashion. OpenMP, developed
jointly by several parallel computing vendors to
address these issues, is an industry-wide standard for
programming shared-memory and distributed shared-memory
multiprocessors. It consists of a set of compiler
directives and library routines that extend FORTRAN, C,
and C++ codes to express shared-memory parallelism.
Parallel Programming in OpenMP is the first book to
teach both the novice and expert parallel programmers
how to program using this new standard. The authors,
who helped design and implement OpenMP while at SGI,
bring a depth and breadth to the book as compiler
writers, application developers, and performance
acknowledgement = ack-nhfb,
keywords = "parallel programming (computer science)",
tableofcontents = "Foreword \\
Preface \\
1: Introduction \\
Performance with OpenMP \\
A first glimpse of OpenMP \\
The OpenMP parallel computer \\
Why OpenMP \\
History of OpenMP \\
Navigating the rest of the book \\
2: Getting started with OpenMP \\
3: Exploiting loop-level parallelism \\
Meaning of the parallel do directive \\
Controlling data sharing \\
Removing data dependences \\
Enhancing performance \\
4: Beyond loop-level parallelism, parallel regions \\
5: Synchronization \\
6: Performance",
author = "Jacques {Chassin de Kergommeaux} and Benhur de
Oliveira Stein",
title = "Paj{\'e}: An Extensible Environment for Visualizing
Multi-threaded Programs Executions",
journal = j-LECT-NOTES-COMP-SCI,
volume = "1900",
pages = "133--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Feb 2 13:02:44 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1900/19000133.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Mark Christiaens",
title = "{JaRec}: Record\slash Replay for Multi-threaded {Java}
crossref = "USENIX:2001:PJV",
pages = "??--??",
year = "2001",
bibdate = "Tue Oct 15 17:45:19 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
URL = "http://www.usenix.org/publications/library/proceedings/jvm01/JVM_wips/S07.pdf",
acknowledgement = ack-nhfb,
author = "Ray Duncan and Duncan Harris and Douglas Reilly and
Craig Rodrigues and Michael Birken and Paul S. Person",
title = "Letters: Plug-in Desupport; Threading and the {.Net}
Framework; {CORBA} Interoperability; Game Over for
{Java}; Totally Wired",
journal = j-DDJ,
volume = "26",
number = "11",
pages = "10, 12",
month = nov,
year = "2001",
ISSN = "1044-789X",
bibdate = "Tue Feb 12 05:21:40 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.ddj.com/",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Orit Edelstein and Eitan Farchi and Yarden Nir and Gil
Ratsaby and Shmuel Ur",
title = "Multithreaded {Java} Program Test Generation",
crossref = "ACM:2001:PAJ",
pages = "181--181",
year = "2001",
bibdate = "Mon May 06 09:31:01 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.philippsen.com/JGI2001/camerareadyabstracts/18.html;
acknowledgement = ack-nhfb,
keywords = "Java",
author = "Wael R. Elwasif and David E. Bernholdt and James A.
Kohl and G. A. Geist",
title = "An Architecture for a Multi-threaded Harness Kernel",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2131",
pages = "126--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Fri Feb 1 08:13:55 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310126.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Paraskevas Evripidou",
title = "{$ D^3 $-Machine}: a decoupled data-driven
multithreaded architecture with variable resolution
volume = "27",
number = "9",
pages = "1197--1225",
month = aug,
year = "2001",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Wed Jul 18 06:31:16 MDT 2001",
bibsource = "http://www.elsevier.com/locate/issn/01678191;
URL = "http://www.elsevier.nl/gej-ng/10/35/21/47/35/25/abstract.html;
acknowledgement = ack-nhfb,
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
author = "Renato J. O. Figueiredo and Jeffrey P. Bradford and
Jos{\'e} A. B. Fortes",
title = "Improving the Performance of Heterogeneous {DSMs} via
journal = j-LECT-NOTES-COMP-SCI,
volume = "1981",
pages = "168--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Feb 2 13:03:02 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1981.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1981/19810168.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Lee Garber",
title = "News Briefs: Is Tech Downturn Changing Education and
Employment Trends; {HTMT} Promises High-Performance
Computing; Controversial Software Law [{UCITA}] Hits
journal = j-COMPUTER,
volume = "34",
number = "10",
pages = "19--21",
month = oct,
year = "2001",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Fri Feb 8 07:11:46 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://dlib.computer.org/co/books/co2001/pdf/rx019.pdf;
acknowledgement = ack-nhfb,
fjournal = "Computer",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
keywords = "hybrid technology multithreaded architecture (HTMT);
Uniform Computer Information Transactions Act (UCITA)",
author = "Travis K. Geiselbrecht",
title = "The {NewOS} Operating System",
journal = j-DDJ,
volume = "26",
number = "12",
pages = "33, 35, 38, 40, 42, 44",
month = dec,
year = "2001",
ISSN = "1044-789X",
bibdate = "Tue Feb 12 05:21:41 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "See correction \cite{Editors:2002:LUC}.",
URL = "http://www.ddj.com/ftp/2001/2001_12/newos.txt;
abstract = "NewOS is a freely available lightweight operating
system written in C for platforms ranging from Intel-
and AMD-based PCs to the Sega Dreamcast. Additional
resources include {\tt newos.txt} (listings) and {\tt
newos.zip} (source code).",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Siegfried Goeschl",
title = "The {JUnit++} Testing Tool",
journal = j-DDJ,
volume = "26",
number = "2",
pages = "34, 36--38",
month = feb,
year = "2001",
ISSN = "1044-789X",
bibdate = "Thu Feb 15 12:14:41 MST 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.ddj.com/ftp/2001/2001_02/junitpp.txt;
abstract = "JUnit++ is a freely available Java unit test framework
that includes a test data repository, command-line
arguments, and a TestRunner class that supports a
built-in repetition counter and multithreading at the
command line. Additional resources include junitpp.txt
(listings) and junitpp.zip (source code).",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Richard J. Hanson and Clay P. Breshears and Henry A.
title = "Using a {Fortran} Interface to {POSIX} Threads",
crossref = "Boisvert:2001:ASS",
pages = "257--272",
year = "2001",
bibdate = "Sat Dec 29 09:54:37 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
author = "Andreas Huber",
title = "Elegant Function Call Wrappers",
journal = j-CCCUJ,
volume = "19",
number = "5",
pages = "8--??",
month = may,
year = "2001",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:31 MDT 2002",
bibsource = "http://www.cuj.com/articles/2001/0105/0105toc.htm?topic=articles;
abstract = "Scheduling functions for later execution is an obvious
requirement in multithreaded programs. How to do that
and preserve both type safety and modularity is not so
obvious. The author combines an old pattern and some
new template techniques to pull it off rather nicely.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Takashi Ishihara and Tiejun Li and Eugene F. Fodor and
Ronald A. Olsson",
title = "A Comparison of Concurrent Programming and Cooperative
journal = j-LECT-NOTES-COMP-SCI,
volume = "1900",
pages = "729--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Feb 2 13:02:44 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1900/19000729.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Chitaka Iwama and Niko Demus Barli and Shuichi Sakai
and Hidehiko Tanaka",
title = "Improving Conditional Branch Prediction on Speculative
Multithreading Architectures",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2150",
pages = "413--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Feb 2 13:05:53 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2150.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2150/21500413.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Kazunori Iwata and Shingo Itabashi and Naohiro Ishii",
title = "A Protocol for Multi-Threaded Processes with Choice in
$ \pi $-Calculus",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2074",
pages = "138--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Feb 2 13:04:30 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2074.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2074/20740138.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "P. Kakulavarapu and O. C. Maquelin and J. N. Amaral
and G. R. Gao",
title = "Dynamic Load Balancers for a Multithreaded
Multiprocessor System",
volume = "11",
number = "1",
pages = "169--??",
month = mar,
year = "2001",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
bibdate = "Sat Feb 23 19:27:51 MST 2002",
bibsource = "http://ejournals.wspc.com.sg/ppl/ppl.shtml;
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "J{\"o}rg Kienzle and Alexander Romanovsky",
title = "Combining tasking and transactions, part {II}: open
multithreaded transactions",
journal = j-SIGADA-LETTERS,
volume = "21",
number = "1",
pages = "67--74",
month = mar,
year = "2001",
ISSN = "1094-3641 (print), 1557-9476 (electronic)",
ISSN-L = "1094-3641",
bibdate = "Sat Aug 9 09:06:10 MDT 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGAda Ada Letters",
author = "J{\"o}rg Kienzle and Alexander Romanovsky",
title = "Implementing exceptions in open multithreaded
transactions based on {Ada 95} exceptions",
journal = j-SIGADA-LETTERS,
volume = "21",
number = "3",
pages = "57--63",
month = sep,
year = "2001",
ISSN = "1094-3641 (print), 1557-9476 (electronic)",
ISSN-L = "1094-3641",
bibdate = "Sat Aug 9 09:06:11 MDT 2003",
bibsource = "http://www.acm.org/sigada/ada_letters/;
acknowledgement = ack-nhfb,
fjournal = "ACM SIGAda Ada Letters",
author = "Iosif {Legrand, on behalf of the MONARC
title = "Multi-threaded, discrete event simulation of
distributed computing systems",
journal = j-COMP-PHYS-COMM,
volume = "140",
number = "1--2",
pages = "274--285",
day = "15",
month = oct,
year = "2001",
DOI = "https://doi.org/10.1016/S0010-4655(01)00281-8",
ISSN = "0010-4655 (print), 1879-2944 (electronic)",
ISSN-L = "0010-4655",
bibdate = "Mon Feb 13 23:41:04 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0010465501002818",
acknowledgement = ack-nhfb,
fjournal = "Computer Physics Communications",
journal-URL = "http://www.sciencedirect.com/science/journal/00104655",
author = "L. Lopes and V. T. Vasconcelos and F. Silva",
title = "Fine-grained multithreading with process calculi",
journal = j-IEEE-TRANS-COMPUT,
volume = "50",
number = "8",
pages = "852--862",
month = aug,
year = "2001",
DOI = "https://doi.org/10.1109/12.947014",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Tue Jul 5 10:03:11 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=947014",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Chi-Keung Luk",
title = "Tolerating memory latency through software-controlled
pre-execution in simultaneous multithreading
journal = j-COMP-ARCH-NEWS,
volume = "29",
number = "2",
pages = "40--51",
month = may,
year = "2001",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:50 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Jeremy Manson and William Pugh",
title = "Core Semantics of Multithreaded {Java}",
crossref = "ACM:2001:PAJ",
pages = "29--38",
year = "2001",
bibdate = "Mon May 06 09:31:01 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.philippsen.com/JGI2001/camerareadyabstracts/42.html;
acknowledgement = ack-nhfb,
keywords = "Java",
author = "Jim Mauro and Richard McDougall",
title = "{Solaris} Internals: Core Kernel Architecture",
address = pub-SUN-MICROSYSTEMS-PRESS:adr,
pages = "xli + 657",
year = "2001",
ISBN = "0-13-022496-0",
ISBN-13 = "978-0-13-022496-5",
LCCN = "QA76.76.O63 M37195 2001",
bibdate = "Fri Apr 11 16:56:49 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/master.bib;
series = "Sun BluePrints Program",
URL = "http://www.sun.com/books/catalog/mauro/index.html",
acknowledgement = ack-nhfb,
libnote = "Not in my library.",
shorttableofcontents = "The Solaris Memory System \\
Threads, Processes, and IPC \\
Files and File Systems \\
Kernel Tunables, Switches, and Limits \\
Kernel Virtual Address Maps",
tableofcontents = "List of Header Files \\
Part 1: Introduction to Solaris Internals \\
1: An Introduction to Solaris \\
2: Kernel Services \\
3: Kernel Synchronization Primitives \\
4: Kernel Bootstrap and Initialization \\
Part 2: The Solaris Memory System \\
5: Solaris Memory Architecture \\
6: Kernel Memory \\
7: Memory Monitoring \\
Part 3: Threads, Processes, and IPC \\
8: The Solaris Multithreaded Process Architecture \\
9: The Solaris Kernel Dispatcher \\
10: Interprocess Communication \\
Part 4: Files and File Systems \\
11: Solaris Files and File I/O \\
12: File System Overview \\
13: File System Framework \\
14: The UNIX File System \\
15: Solaris File System Cache \\
Appendix A: Kernel Tunables, Switches, and Limits \\
Appendix B: Kernel Virtual Address Maps \\
Appendix C: A Sample Procfs Utility",
author = "Dan Nagle",
title = "Multithreading, {Fthreads}, and {Visual Fortran}",
journal = j-DDJ,
volume = "26",
number = "7",
pages = "36, 38, 40",
month = jul,
year = "2001",
ISSN = "1044-789X",
bibdate = "Thu Jun 7 06:07:17 MDT 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.ddj.com/ftp/2001/2001_07/fthreads.zip",
abstract = "Dan presents a Fortran module that helps you write
multithreaded programs for Windows-based applications.
Additional resources include fthreads.zip (source
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Greg Nakhimovsky",
title = "Improving Scalability Of Multithreaded Dynamic Memory
journal = j-DDJ,
volume = "26",
number = "7",
pages = "44, 46, 48--50, 52, 54",
month = jul,
year = "2001",
ISSN = "1044-789X",
bibdate = "Thu Jun 7 06:07:17 MDT 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.ddj.com/ftp/2001/2001_07/mthot.txt;
abstract = "Multiprocessor/multithreaded environments add a new
dimension to the familiar malloc facility. The
``MT-hot'' implementation Greg presents here lets
multiple threads execute in parallel without major
delays. Additional resources include mthot.txt
(listings) and mthot.zip (source code).",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "D. S. Nikolopoulos and E. Artiaga and E. Ayguad{\'e}
and J. Labarta",
title = "Exploiting memory affinity in {OpenMP} through
schedule reuse",
journal = j-COMP-ARCH-NEWS,
volume = "29",
number = "5",
pages = "49--55",
month = dec,
year = "2001",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Emre {\"O}zer and Thomas M. Conte and Saurabh Sharma",
title = "Weld: a Multithreading Technique Towards
Latency-Tolerant {VLIW} Processors",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2228",
pages = "192--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Feb 2 13:07:14 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2228.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2228/22280192.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "James Pang and Gholamali Shoja and Eric Manning",
title = "Providing Soft Real-time {QoS} Guarantees for {Java}
crossref = "ACM:2001:PAJ",
pages = "39--46",
year = "2001",
bibdate = "Mon May 06 09:31:01 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.philippsen.com/JGI2001/camerareadyabstracts/21.html;
acknowledgement = ack-nhfb,
keywords = "Java",
author = "J.-M. Parcerisa and A. Gonzalez",
title = "Improving latency tolerance of multithreading through
journal = j-IEEE-TRANS-COMPUT,
volume = "50",
number = "10",
pages = "1084--1094",
month = oct,
year = "2001",
DOI = "https://doi.org/10.1109/12.956093",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Tue Jul 5 10:03:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=956093",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Manoj Plakal and Charles N. Fischer",
title = "Concurrent Garbage Collection Using Program Slices on
Multithreaded Processors",
journal = j-SIGPLAN,
volume = "36",
number = "1",
pages = "94--100",
month = jan,
year = "2001",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:18:22 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "ACM SIGPLAN International Symposium on Memory
Management (ISMM'00)",
author = "Boris V. Protopopov and Anthony Skjellum",
title = "A Multithreaded {Message Passing Interface (MPI)}
Architecture: Performance and Program Issues",
journal = j-J-PAR-DIST-COMP,
volume = "61",
number = "4",
pages = "449--466",
day = "1",
month = apr,
year = "2001",
DOI = "https://doi.org/10.1006/jpdc.2000.1674",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Fri Feb 22 15:30:36 MST 2002",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1674;
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Irfan Pyarali and Marina Spivak and Ron Cytron and
Douglas C. Schmidt",
title = "Evaluating and Optimizing Thread Pool Strategies for
Real-Time {CORBA}",
journal = j-SIGPLAN,
volume = "36",
number = "8",
pages = "214--222",
month = aug,
year = "2001",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:18:29 MST 2003",
bibsource = "http://www.cs.wisc.edu/~bodik/om2001/program.html;
URL = "",
acknowledgement = ack-nhfb,
annote = "OM'01: The First Workshop on Optimization of
Middleware and Distributed Systems",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Douglas Reilly",
title = "Threading and the {.Net} Framework",
journal = j-DDJ,
volume = "26",
number = "8",
pages = "30, 32--33, 36, 38",
month = aug,
year = "2001",
ISSN = "1044-789X",
bibdate = "Wed Jul 11 06:31:35 MDT 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.ddj.com/ftp/2001/2001_08/thrednet.txt",
abstract = "Microsoft's .NET Framework offers a number of
features, such as threading, that simplify difficult
tasks. Additional resources include thrednet.txt
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Martin Rinard",
title = "Analysis of Multithreaded Programs",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2126",
pages = "1--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Feb 2 13:05:28 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2126.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2126/21260001.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Lucas Roh and Bhanu Shankar and Wim B{\"o}hm and Walid
title = "Resource Management in Dataflow-Based Multithreaded
journal = j-J-PAR-DIST-COMP,
volume = "61",
number = "5",
pages = "581--608",
day = "1",
month = may,
year = "2001",
DOI = "https://doi.org/10.1006/jpdc.2001.1708",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Fri Feb 22 15:30:37 MST 2002",
bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.2001.1708;
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Alexandru Salcianu and Martin Rinard",
title = "Pointer and escape analysis for multithreaded
journal = j-SIGPLAN,
volume = "36",
number = "7",
pages = "12--23",
month = jul,
year = "2001",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:18:28 MST 2003",
bibsource = "http://www.acm.org/pubs/contents/proceedings/series/ppopp/;
URL = "http://www.acm.org/pubs/articles/proceedings/ppopp/379539/p12-salcianu/p12-salcianu.pdf;
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "U. Sigmund and T. Ungerer",
title = "On Speculation Control in Simultaneous Multithreaded
journal = j-J-UCS,
volume = "7",
number = "9",
pages = "848--868",
day = "28",
month = sep,
year = "2001",
CODEN = "????",
ISSN = "0948-695X (print), 0948-6968 (electronic)",
ISSN-L = "0948-6968",
bibdate = "Wed Feb 20 07:23:10 MST 2002",
bibsource = "http://www.jucs.org/jucs;
URL = "http://www.jucs.org/jucs_7_9/on_speculation_control_in",
acknowledgement = ack-nhfb,
fjournal = "J.UCS: Journal of Universal Computer Science",
journal-URL = "http://www.jucs.org/jucs",
author = "Burton Smith",
title = "{Cray MTA}: Multithreading for Latency Response",
journal = j-COMPUTER,
volume = "34",
number = "4",
pages = "69--69",
month = apr,
year = "2001",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Sat Apr 7 07:21:35 MDT 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/computer2000.bib;
URL = "http://dlib.computer.org/co/books/co2001/pdf/r4059.pdf;
acknowledgement = ack-nhfb,
annote = "Describes the Cray MTA system, which has up to 256
multithreaded processors. There are no data caches:
instead, each processor switches context every cycle
among up to 128 instruction streams, and each stream
can have up to eight outstanding memory references, so
memory latency up to 1024 cycles does not delay
fjournal = "Computer",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
author = "Gurindar S. Sohi and Amir Roth",
title = "Speculative Multithreaded Processors",
journal = j-COMPUTER,
volume = "34",
number = "4",
pages = "66--73",
month = apr,
year = "2001",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Sat Apr 7 07:21:35 MDT 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/computer2000.bib;
note = "See errata \cite{Anonymous:2001:ESM}.",
URL = "http://dlib.computer.org/co/books/co2001/pdf/r4066.pdf;
acknowledgement = ack-nhfb,
fjournal = "Computer",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
author = "Andrew Sohn and Yuetsu Kodama and Jui-Yuan Ku and
Mitsuhisa Sato and Yoshinori Yamaguchi",
title = "Chapter 15. {Tolerating} Communication Latency through
Dynamic Thread Invocation in a Multithreaded
journal = j-LECT-NOTES-COMP-SCI,
volume = "1808",
pages = "525--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Feb 2 13:02:34 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1808.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1808/18080525.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Bjarne Steensgaard",
title = "Thread-Specific Heaps for Multi-Threaded Programs",
journal = j-SIGPLAN,
volume = "36",
number = "1",
pages = "18--24",
month = jan,
year = "2001",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sun Dec 14 09:18:22 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "ACM SIGPLAN International Symposium on Memory
Management (ISMM'00)",
author = "Michael Sung and Ronny Krashinsky and Krste
title = "Multithreading decoupled architectures for
complexity-effective general purpose computing",
journal = j-COMP-ARCH-NEWS,
volume = "29",
number = "5",
pages = "56--61",
month = dec,
year = "2001",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Kevin B. Theobald and Rishi Kumar and Gagan Agrawal
and Gerd Heber and Ruppa K. Thulasiram and Guang R.
title = "Developing a Communication Intensive Application on
the {EARTH} Multithreaded Architecture (Distinguished
journal = j-LECT-NOTES-COMP-SCI,
volume = "1900",
pages = "625--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Feb 2 13:02:44 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1900/19000625.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Gary Zoppetti and Gagan Agrawal and Rishi Kumar",
title = "Impact of Data Distribution on Performance of
Irregular Reductions on Multithreaded Architectures",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2110",
pages = "483--??",
year = "2001",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Feb 2 13:05:11 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2110.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2110/21100483.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Erika {\'A}brah{\'a}m-Mumm and Frank S. de Boer and
Willem-Paul de Roever and Martin Steffen",
title = "Verification for {Java}'s Reentrant Multithreading
journal = j-LECT-NOTES-COMP-SCI,
volume = "2303",
pages = "5--??",
year = "2002",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Sep 10 19:09:21 MDT 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2303.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2303/23030005.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Matthew Adiletta and Mark Rosenbluth and Debra
Bernstein and Gilbert Wolrich and Hugh Wilkinson",
title = "The Next Generation of {Intel IXP} Network
journal = j-INTEL-TECH-J,
volume = "6",
number = "3",
pages = "6--18",
day = "15",
month = aug,
year = "2002",
ISSN = "1535-766X",
bibdate = "Sun Nov 17 11:06:06 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://developer.intel.com/technology/itj/2002/volume06issue03/art01_nextgenixp/p01_abstract.htm;
keywords = "10Gb/s; ATM; communication architecture; Ethernet;
IXP; microprocessor architecture; multi-processors;
multi-service switches; multi-threading; network
processors; OC-192; OC-48; routing; switching",
author = "Matthew Adiletta and Donald Hooper and Myles Wilde",
title = "Packet over {SONET}: Achieving 10 {Gigabit}/sec Packet
Processing with an {IXP2800}",
journal = j-INTEL-TECH-J,
volume = "6",
number = "3",
pages = "29--39",
day = "15",
month = aug,
year = "2002",
ISSN = "1535-766X",
bibdate = "Sun Nov 17 11:06:06 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://developer.intel.com/technology/itj/2002/volume06issue03/art05_packetoversonet/p01_abstract.htm;
keywords = "10Gbs; ATM; communication architecture; Ethernet;
hardware-based multi-threading; IXP; microprocessor
architecture; multi-processors; multi-service switches;
network processors; OC-192; OC-48; routing; switching",
author = "Anonymous",
title = "Speculative threads",
journal = j-COMP-ARCH-NEWS,
volume = "30",
number = "5",
pages = "??--??",
month = dec,
year = "2002",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:23 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Gabriel Antoniu and Luc Boug{\'e}",
title = "Implementing Multithreaded Protocols for Release
Consistency on Top of the Generic {DSM}-{PM} Platform",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2326",
pages = "179--??",
year = "2002",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Sep 10 19:09:32 MDT 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2326.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2326/23260179.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "John H. Baldwin",
title = "Locking in the Multithreaded {FreeBSD} Kernel",
crossref = "USENIX:2002:PBF",
pages = "27--35",
year = "2002",
bibdate = "Tue Oct 15 12:37:27 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.usenix.org/publications/library/proceedings/bsdcon02/baldwin.html",
acknowledgement = ack-nhfb,
author = "B. Balis and M. Bubak and W. Funika and R.
title = "A Concept of Portable Monitoring of Multithreaded
journal = j-LECT-NOTES-COMP-SCI,
volume = "2330",
pages = "884--??",
year = "2002",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Sep 10 19:09:35 MDT 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2330.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2330/23300884.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "G{\'e}rard Boudol and Ilaria Castellani",
title = "Noninterference for concurrent programs and thread
journal = j-THEOR-COMP-SCI,
volume = "281",
number = "1--2",
pages = "109--130",
month = may,
year = "2002",
ISSN = "0304-3975 (print), 1879-2294 (electronic)",
ISSN-L = "0304-3975",
bibdate = "Wed Nov 20 18:08:56 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Theoretical Computer Science",
journal-URL = "http://www.sciencedirect.com/science/journal/03043975",
author = "L. Boug{\'e} and V. Danjean and R. Namyst",
title = "Improving Reactivity to {I/O} Events in Multithreaded
Environments Using a Uniform, Scheduler-Centric {API}",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2400",
pages = "605--??",
year = "2002",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Thu Sep 12 08:40:04 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2400.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2400/24000605.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Gordon Brebner",
title = "Multithreading for Logic-Centric Systems",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2438",
pages = "5--??",
year = "2002",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Sep 10 19:10:28 MDT 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2438.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2438/24380005.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "John Callaway",
title = "Visualization of threads in a running {Java} program",
type = "Thesis ({M.S.})",
school = "University of California, Santa Cruz",
address = "Santa Cruz, CA, USA",
year = "2002",
LCCN = "QA76.73.J38 C36 2002",
bibdate = "Tue May 6 05:26:58 MDT 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "academic dissertations -- University of California,
Santa Cruz -- 2002; academic dissertations --
University of California, Santa Cruz -- computer;
computer science; computer software -- development;
Java (computer program language); object-oriented
programming (computer science); science; software
engineering; visualization",
author = "Christopher D. Carothers and Boleslaw K. Szymanski",
title = "Checkpointing Multithreaded Programs",
journal = j-DDJ,
volume = "27",
number = "8",
pages = "??--??",
month = aug,
year = "2002",
ISSN = "1044-789X",
bibdate = "Fri Sep 13 06:15:52 MDT 2002",
bibsource = "http://www.ddj.com/articles/2002/0208/;
URL = "http://www.ddj.com/ftp/2002/2002_08/checkpt.txt",
abstract = "Checkpointing is the process by which you grab
snapshots of running programs. Additional resources
include checkpt.txt (listings).",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Fr{\'e}d{\'e}ric Cazals",
title = "Non-Intrusive Debugging and Incremental Visualization
with the Geometric Stethoscope",
journal = j-J-GRAPHICS-TOOLS,
volume = "7",
number = "2",
pages = "27--40",
year = "2002",
ISSN = "1086-7651",
bibdate = "Tue Dec 16 13:47:48 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.acm.org/jgt/papers/Cazals02/",
abstract = "Developing and debugging geometric applications is
known to be a difficult task: The calculations and data
structures can be involved, there are degenerate cases
and numerical issues, etc. This paper presents a
software setup aiming at easing the development, the
debugging, and the maintenance of geometric
applications. \par
More precisely, {\em incremental visualization\/} is
defined as the possibility for the programmer to
visualize interactively any significant update of the
geometric data structures at any time. {\em
Non-intrusive debugging\/} is defined as the
possibility of visualizing any geometric entity in
three dimensions from a standard debugger at any time
without modifying the source code. We present a setup
to perform incremental visualization and non-intrusive
debugging. This setup is based on multithreading and
requires a three-dimensional viewer, such as Open
Inventor, Vtk, or Geomview, and a standard debugger
(dbx or gdb). \par
An Open Inventor based C++ implementation of this setup
accompanies this paper. Using it simply requires
writing the functions converting the user's data
structures into Open Inventor's data structures. The
setup could easily be extended to accommodate other
media such as sound, video, etc.",
acknowledgement = ack-nhfb,
fjournal = "Journal of Graphics Tools: JGT",
journal-URL = "http://www.tandfonline.com/loi/ujgt20",
author = "Robert S. Chappell and Francis Tseng and Adi Yoaz and
Yale N. Patt",
title = "Difficult-path branch prediction using subordinate
journal = j-COMP-ARCH-NEWS,
volume = "30",
number = "2",
pages = "307--317",
month = may,
year = "2002",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:50 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Puneesh Chaudhry",
title = "A Per-Thread Singleton Class",
journal = j-CCCUJ,
volume = "20",
number = "5",
pages = "14--??",
month = may,
year = "2002",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:36 MDT 2002",
bibsource = "http://www.cuj.com/articles/2002/0205/0205toc.htm?topic=articles;
abstract = "A refreshing look at an old pattern.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Jong-Deok Choi and Keunwoo Lee and Alexey Loginov and
Robert O'Callahan and Vivek Sarkar and Manu Sridharan",
title = "Efficient and precise datarace detection for
multithreaded object-oriented programs",
journal = j-SIGPLAN,
volume = "37",
number = "5",
pages = "258--269",
month = may,
year = "2002",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Thu May 15 12:23:02 MDT 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Jong-Deok Choi and Andreas Zeller",
title = "Isolating failure-inducing thread schedules",
journal = j-SIGSOFT,
volume = "27",
number = "4",
pages = "210--220",
month = jul,
year = "2002",
DOI = "https://doi.org/10.1145/566171.566211",
ISSN = "0163-5948 (print), 1943-5843 (electronic)",
ISSN-L = "0163-5948",
bibdate = "Wed Aug 1 17:14:20 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Consider a multi-threaded application that
occasionally fails due to non-determinism. Using the
DEJAVU capture/replay tool, it is possible to record
the thread schedule and replay the application in a
deterministic way. By systematically narrowing down the
difference between a thread schedule that makes the
program pass and another schedule that makes the
program fail, the Delta Debugging approach can pinpoint
the error location automatically---namely, the
location(s) where a thread switch causes the program to
fail. In a case study, Delta Debugging isolated the
failure-inducing schedule difference from 3.8 billion
differences in only 50 tests.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGSOFT Software Engineering Notes",
journal-URL = "https://dl.acm.org/citation.cfm?id=J728",
author = "Keith Clark and Peter J. Robinson",
title = "Agents as Multi-threaded Logical Objects",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2407",
pages = "33--??",
year = "2002",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Sep 10 19:10:17 MDT 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2407.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2407/24070033.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Jonathan J. Cook",
title = "Reverse Execution of {Java} Bytecode",
journal = j-COMP-J,
volume = "45",
number = "6",
pages = "608--619",
month = "????",
year = "2002",
DOI = "https://doi.org/10.1093/comjnl/45.6.608",
ISSN = "0010-4620 (print), 1460-2067 (electronic)",
ISSN-L = "0010-4620",
bibdate = "Wed Nov 6 11:21:54 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/compj2000.bib;
URL = "http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_06/450608.sgm.abs.html;
abstract = "We demonstrate a model, including operational
semantics, for the reverse execution of stack-based
code. We discuss our modification of the Kaffe
implementation of the Java Virtual Machine, supporting
a debugger capable of running Java bytecode backwards.
We achieve reverse execution by logging the state lost
during each operation or by directly reversing
instructions. Our debugger has facilities for stepping,
stepping over methods and running to breakpoints, in
both directions. Multi-threading is supported. It is
also possible to step through the bytecode when the
Java source code is not available. The debugger has
both a command line user interface and a graphical user
interface with facilities for editing code and running
the Java compiler.",
acknowledgement = ack-nhfb,
fjournal = "The Computer Journal",
journal-URL = "http://comjnl.oxfordjournals.org/",
author = "Giorgio Delzanno and Jean-Fran{\c{c}}ois Raskin and
Laurent {Van Begin}",
title = "Towards the Automated Verification of Multithreaded
{Java} Programs",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2280",
pages = "173--??",
year = "2002",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Sep 10 19:09:09 MDT 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2280.htm;
URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2280/22800173.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Yun He and Chris H. Q. Ding",
key = "multidimensional arrays; index reshuffle; vacancy
tracking cycles; global exchange; dynamical remapping;
MPI; OpenMP; hybrid MPI/OpenMP; SMP cluster.",
title = "{MPI} and {OpenMP} Paradigms on Cluster of {SMP}
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap325.pdf",
abstract = "We investigate remapping multi-dimensional arrays on
cluster of SMP architectures under OpenMP, MPI, and
hybrid paradigms. Traditional method of array transpose
needs an auxiliary array of the same size and a copy
back stage. We recently developed an in-place method
using vacancy tracking cycles. The vacancy tracking
algorithm outperforms the traditional 2-array method as
demonstrated by extensive comparisons. The independence
of vacancy tracking cycles allows efficient
parallelization of the in-place method on SMP
architectures at node level. Performance of
multi-threaded parallelism using OpenMP are tested with
different scheduling methods and different number of
threads. The vacancy tracking method is parallelized
using several parallel paradigms. At node level, pure
OpenMP outperforms pure MPI by a factor of 2.76. Across
entire cluster of SMP nodes, the hybrid MPI/OpenMP
implementation outperforms pure MPI by a factor of
4.44, demonstrating the validity of the parallel
paradigm of mixing MPI with OpenMP.",
acknowledgement = ack-nhfb,
author = "Austin Donnelly",
title = "Lightweight Thread Tunnelling in Network
journal = j-LECT-NOTES-COMP-SCI,
volume = "2546",
pages = "48--??",
year = "2002",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Nov 30 20:58:13 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2546.htm;
URL = "http://link.springer.de/link/service/series/0558/bibs/2546/25460048.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "O. Edelstein and E. Farchi and Y. Nir and G. Ratsaby
and S. Ur",
title = "Multithreaded {Java} program test generation",
journal = j-IBM-SYS-J,
volume = "41",
number = "1",
pages = "111--125",
month = "????",
year = "2002",
ISSN = "0018-8670",
bibdate = "Tue Feb 12 17:23:05 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.research.ibm.com/journal/sj/411/edelstein.html;
acknowledgement = ack-nhfb,
fjournal = "IBM Systems Journal",
ordernumber = "G321-0144",
author = "{The Editors} and Kim Reidar Lantz and Ze'ev Atlas and
Pete Nelson and Gus J. Grubba",
title = "Letters: {URL} Correction [``{The NewOS Operating
System}'']; Passing Context to Threads; Compiling
{Perl\slash Tk} Scripts; Standing by {Al}'s Principles;
Understanding Photomosaics",
journal = j-DDJ,
volume = "27",
number = "1",
pages = "10, 12",
month = jan,
year = "2002",
ISSN = "1044-789X",
bibdate = "Tue Feb 12 05:21:41 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "See \cite{Geiselbrecht:2001:NOS}.",
URL = "http://www.ddj.com/",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Tarek El-Ghazawi and Fran{\c{c}}ois Cantonnet",
title = "{UPC} Performance and Potential: a {NPB}
Experimental Study",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap316.pdf",
abstract = "UPC, or Unified Parallel C, is a parallel extension of
ANSI C. UPC follows a distributed shared memory
programming model aimed at leveraging the ease of
programming of the shared memory paradigm, while
enabling the exploitation of data locality. UPC
incorporates constructs that allow placing data near
the threads that manipulate them to minimize remote
accesses. This paper gives an overview of the concepts
and features of UPC and establishes, through extensive
performance measurements of NPB workloads, the
viability of the UPC programming language compared to
the other popular paradigms. Further, through
performance measurements we identify the challenges,
the remaining steps and the priorities for UPC. It will
be shown that with proper hand tuning and optimized
collective operations libraries, UPC performance will
be comparable to that of MPI. Furthermore, by
incorporating such improvements into automatic compiler
optimizations, UPC will compare quite favorably to
message passing in ease of programming.",
acknowledgement = ack-nhfb,
keywords = "NPB (NAS Parallel Benchmark)",
author = "E. Feuerstein and A. Strejilevich de Loma",
title = "On-Line Multi-Threaded Paging",
journal = j-ALGORITHMICA,
volume = "32",
number = "1",
pages = "36--60",
month = jan,
year = "2002",
DOI = "https://doi.org/10.1007/s00453-001-0073-z",
ISSN = "0178-4617 (print), 1432-0541 (electronic)",
ISSN-L = "0178-4617",
MRclass = "68N25 (68Q10 68W05)",
MRnumber = "MR1867023 (2002h:68033)",
bibdate = "Fri Jan 6 11:38:14 MST 2006",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0178-4617&volume=32&issue=1;
MathSciNet database",
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0178-4617&volume=32&issue=1&spage=36",
acknowledgement = ack-nhfb,
fjournal = "Algorithmica",
journal-URL = "http://link.springer.com/journal/453",
author = "Cormac Flanagan and Shaz Qadeer and Sanjit A. Seshia",
title = "A Modular Checker for Multithreaded Programs",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2404",
pages = "180--??",
year = "2002",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Nov 30 20:57:05 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2404.htm;
URL = "http://link.springer.de/link/service/series/0558/bibs/2404/24040180.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Rajat P. Garg and Ilya Sharapov",
title = "Techniques for optimizing applications: high
performance computing",
address = pub-SUN-MICROSYSTEMS-PRESS:adr,
pages = "xliii + 616",
year = "2002",
ISBN = "0-13-093476-3",
ISBN-13 = "978-0-13-093476-5",
LCCN = "QA76.88 .G37 2002",
bibdate = "Fri Apr 11 08:26:42 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
series = "Sun BluePrints Program",
URL = "http://www.sun.com/books/catalog/garg.html/index.html",
acknowledgement = ack-nhfb,
annote = "From the Web site: The \verb=HPC_code_examples.tar.Z=
tar-file contains the source code, makefiles, and shell
scripts required to compile, link, and run the example
programs discussed in the book.",
keywords = "Forte Developer; MPI; OpenMP; Sun ClusterTools; Sun
author = "Peter Haggar",
title = "{Java Q\&A}: Does {Java} Guarantee Thread Safety?",
journal = j-DDJ,
volume = "27",
number = "6",
pages = "91--83",
month = jun,
year = "2002",
ISSN = "1044-789X",
bibdate = "Wed May 1 15:43:59 MDT 2002",
bibsource = "http://www.ddj.com/articles/2002/0206/;
note = "Comments on lack of atomic-update guarantee in Java
for objects larger than 32 bits, such as {\tt long} and
{\tt double}, with sample code to exhibit the
URL = "http://www.ddj.com/ftp/2002/2002_06/jqa0602.txt",
abstract = "Additional resources include jqa0602.txt (listings).",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Richard J. Hanson and Clay P. Breshears and Henry A.
title = "{Algorithm 821}: a {Fortran} interface to {POSIX}
journal = j-TOMS,
volume = "28",
number = "3",
pages = "354--371",
month = sep,
year = "2002",
DOI = "https://doi.org/10.1145/569147.569152",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Sat Nov 9 11:16:50 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Pthreads is the library of POSIX standard functions
for concurrent, multithreaded programming. The POSIX
standard only defines an application programming
interface (API) to the C programming language, not to
Fortran. Many scientific and engineering applications
are written in Fortran. Also, many of these
applications exhibit functional, or task-level,
concurrency. They would benefit from multithreading,
especially on symmetric multiprocessors (SMP). We
present here an interface to that part of the Pthreads
library that is compatible with standard Fortran. The
contribution consists of two primary source files: a
Fortran module and a collection of C wrappers to
Pthreads functions. The Fortran module defines the data
structures, interface and initialization routines used
to manage threads. The stability and portability of the
Fortran API to Pthreads is demonstrated using common
mathematical computations on three different systems.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
author = "Suchuan Dong and George Em. Karniadakis",
title = "Dual-Level Parallelism for Deterministic and
Stochastic {CFD} Problems",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap137.pdf",
abstract = "A hybrid two-level parallelism using MPI/OpenMP is
implemented in the general-purpose spectral/hp element
CFD code NekTar to take advantage of the hierarchical
structures arising in deterministic and stochastic CFD
problems. We take a coarse grain approach to
shared-memory parallelism with OpenMP and employ a
workload-splitting scheme that can reduce the OpenMP
synchronizations to the minimum. The hybrid
implementation shows good scalability with respect to
both the problem size and the number of processors in
case of a fixed problem size. With the same number of
processors, the hybrid model with 2 (or 4) OpenMP
threads per MPI process is observed to perform better
than pure MPI and pure OpenMP on the NCSA SGI Origin
2000, while the pure MPI model performs the best on the
IBM SP3 at SDSC and on the Compaq Alpha cluster at PSC.
A key new result is that the use of threads facilitates
effectively $p$-refinement, which is crucial to adaptive
discretization using high-order methods.",
acknowledgement = ack-nhfb,
author = "Krishna M. Kavi and Alireza Moshtaghi and Deng-jyi
title = "Modeling Multithreaded Applications Using {Petri}
journal = j-INT-J-PARALLEL-PROG,
volume = "30",
number = "5",
pages = "353--371",
month = oct,
year = "2002",
DOI = "https://doi.org/10.1023/A:1019917329895",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Wed Jul 6 16:40:00 MDT 2005",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=30&issue=5;
URL = "http://ipsapp009.lwwonline.com/content/getfile/4773/29/1/abstract.htm;
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
author = "Bill Kempf",
title = "The {Boost.Threads} Library",
journal = j-CCCUJ,
volume = "20",
number = "5",
pages = "6--??",
month = may,
year = "2002",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:36 MDT 2002",
bibsource = "http://www.cuj.com/articles/2002/0205/0205toc.htm?topic=articles;
abstract = "Standard C++ threads are imminent. CUJ predicts they
will derive from the Boost.Threads library, explored
here by the eminent author.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Cosimo Laneve",
title = "A type system for {JVM} threads",
journal = j-THEOR-COMP-SCI,
volume = "290",
number = "1",
pages = "741--778",
month = oct,
year = "2002",
ISSN = "0304-3975 (print), 1879-2294 (electronic)",
ISSN-L = "0304-3975",
bibdate = "Wed Nov 20 18:15:29 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Theoretical Computer Science",
journal-URL = "http://www.sciencedirect.com/science/journal/03043975",
author = "Dmitri Leman",
title = "An Efficient and Flexible Tracing Technique",
journal = j-CCCUJ,
volume = "20",
number = "4",
pages = "24--??",
month = apr,
year = "2002",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:36 MDT 2002",
bibsource = "http://www.cuj.com/articles/2002/0204/0204toc.htm?topic=articles;
abstract = "This extensible tracing framework tames the dreaded
multithreaded debugging demon.",
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "G. Mahinthakumar and F. Saied",
title = "A Hybrid {MPI-OpenMP} Implementation of an Implicit
Finite-Element Code on Parallel Architectures",
journal = j-IJHPCA,
volume = "16",
number = "4",
pages = "371--393",
month = "Winter",
year = "2002",
ISSN = "1094-3420 (print), 1741-2846 (electronic)",
ISSN-L = "1094-3420",
bibdate = "Fri Nov 28 06:52:13 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
journal-URL = "http://hpc.sagepub.com/content/by/year",
author = "Jos{\'e} F. Mart{\'\i}nez and Josep Torrellas",
title = "Speculative synchronization: applying thread-level
speculation to explicitly parallel applications",
journal = j-COMP-ARCH-NEWS,
volume = "30",
number = "5",
pages = "18--29",
month = dec,
year = "2002",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:23 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Jos{\'e} F. Mart{\'\i}nez and Josep Torrellas",
title = "Speculative synchronization: applying thread-level
speculation to explicitly parallel applications",
journal = j-SIGPLAN,
volume = "37",
number = "10",
pages = "18--29",
month = oct,
year = "2002",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Thu May 15 12:23:09 MDT 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Jos{\'e} F. Mart{\'\i}nez and Josep Torrellas",
title = "Speculative synchronization: applying thread-level
speculation to explicitly parallel applications",
journal = j-OPER-SYS-REV,
volume = "36",
number = "5",
pages = "18--29",
month = dec,
year = "2002",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:56 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Carl J. Mauer and Mark D. Hill and David A. Wood",
title = "Full-system timing-first simulation",
journal = j-SIGMETRICS,
volume = "30",
number = "1",
pages = "108--116",
month = jun,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1145/511334.511349",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Thu Jun 26 11:38:22 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Computer system designers often evaluate future design
alternatives with detailed simulators that strive for
{\em functional fidelity\/} (to execute relevant
workloads) and {\em performance fidelity\/} (to rank
design alternatives). Trends toward multi-threaded
architectures, more complex micro-architectures, and
richer workloads, make authoring detailed simulators
increasingly difficult. To manage simulator complexity,
this paper advocates decoupled simulator organizations
that separate functional and performance concerns.
Furthermore, we define an approach, called {\em
timing-first simulation}, that uses an augmented timing
simulator to execute instructions important to
performance in conjunction with a functional simulator
to insure correctness. This design simplifies software
development, leverages existing simulators, and can
model micro-architecture timing in detail. We describe
the timing-first organization and our experiences
implementing TFsim, a full-system multiprocessor
performance simulator. TFsim models a pipelined,
out-of-order micro-architecture in detail, was
developed in less than one person-year, and performs
competitively with previously-published simulators.
TFsim's timing simulator implements dynamically common
instructions (99.99\% of them), while avoiding the vast
and exacting implementation efforts necessary to run
unmodified commercial operating systems and workloads.
Virtutech Simics, a full-system functional simulator,
checks and corrects the timing simulator's execution,
contributing 18-36\% to the overall run-time. TFsim's
mostly correct functional implementation introduces a
worst-case performance error of 4.8\% for our
commercial workloads. Some additional simulator
performance is gained by verifying functional
correctness less often, at the cost of some additional
performance error.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
author = "Shubhendu S. Mukherjee and Michael Kontz and Steven K.
title = "Detailed design and evaluation of redundant
multithreading alternatives",
journal = j-COMP-ARCH-NEWS,
volume = "30",
number = "2",
pages = "99--110",
month = may,
year = "2002",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:50 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Jeffrey Oplinger and Monica S. Lam",
title = "Enhancing software reliability with speculative
journal = j-COMP-ARCH-NEWS,
volume = "30",
number = "5",
pages = "184--196",
month = dec,
year = "2002",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:23 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Jeffrey Oplinger and Monica S. Lam",
title = "Enhancing software reliability with speculative
journal = j-SIGPLAN,
volume = "37",
number = "10",
pages = "184--196",
month = oct,
year = "2002",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Thu May 15 12:23:09 MDT 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Jeffrey Oplinger and Monica S. Lam",
title = "Enhancing software reliability with speculative
journal = j-OPER-SYS-REV,
volume = "36",
number = "5",
pages = "184--196",
month = dec,
year = "2002",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:56 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Tomas Plachetka",
title = "(Quasi-) Thread-Safe {PVM} and (Quasi-) Thread-Safe
{MPI} without Active Polling",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2474",
pages = "296--??",
year = "2002",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Sat Nov 30 20:57:35 MST 2002",
bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm;
URL = "http://link.springer.de/link/service/series/0558/bibs/2474/24740296.htm;
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Y. Sato",
title = "A Study of {Java} Language for Effective Thread
journal = "Record of Electrical and Communication Engineering
Conversazione Tohoku University",
volume = "71",
number = "1",
publisher = "Tohoku Daigaku Dentsu Danwakai",
pages = "597--598",
year = "2002",
CODEN = "????",
ISSN = "0385-7719",
bibdate = "Tue Dec 24 07:09:37 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
Ingenta database",
acknowledgement = ack-nhfb,
author = "Ching-Kuang Shene",
title = "{ThreadMentor}: a system for teaching multithreaded
journal = j-SIGCSE,
volume = "34",
number = "3",
pages = "229--229",
month = sep,
year = "2002",
DOI = "https://doi.org/10.1145/637610.544497",
ISSN = "0097-8418 (print), 2331-3927 (electronic)",
ISSN-L = "0097-8418",
bibdate = "Sat Nov 17 16:56:56 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "SIGCSE Bulletin (ACM Special Interest Group on
Computer Science Education)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688",
author = "Allan Snavely and Dean M. Tullsen and Geoff Voelker",
title = "Symbiotic jobscheduling with priorities for a
simultaneous multithreading processor",
journal = j-SIGMETRICS,
volume = "30",
number = "1",
pages = "66--76",
month = jun,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1145/511399.511343",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Thu Jun 26 11:38:22 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Simultaneous Multithreading machines benefit from
jobscheduling software that monitors how well
coscheduled jobs share CPU resources, and coschedules
jobs that interact well to make more efficient use of
those resources. As a result, informed coscheduling can
yield significant performance gains over naive
schedulers. However, prior work on coscheduling focused
on equal-priority job mixes, which is an unrealistic
assumption for modern operating systems. This paper
demonstrates that a scheduler for an SMT machine can
both satisfy process priorities and symbiotically
schedule low and high priority threads to increase
system throughput. Naive priority schedulers dedicate
the machine to high priority jobs to meet priority
goals, and as a result decrease opportunities for
increased performance from multithreading and
coscheduling. More informed schedulers, however, can
dynamically monitor the progress and resource
utilization of jobs on the machine, and dynamically
adjust the degree of multithreading to improve
performance while still meeting priority goals. Using
detailed simulation of an SMT architecture, we
introduce and evaluate a series of five software and
hardware-assisted priority schedulers. Overall, our
results indicate that coscheduling priority jobs can
significantly increase system throughput by as much as
40\%, and that (1) the benefit depends upon the
relative priority of the coscheduled jobs, and (2) more
sophisticated schedulers are more effective when the
differences in priorities are greatest. We show that
our priority schedulers can decrease average turnaround
times for a random job mix by as much as 33\%.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
keywords = "job scheduling; priorities; simultaneous
author = "Angela C. Sodan",
title = "Applications on a multithreaded architecture: a case
study with {EARTH-MANNA}",
volume = "28",
number = "1",
pages = "3--33",
month = jan,
year = "2002",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Fri Feb 22 16:52:43 MST 2002",
bibsource = "http://www.elsevier.com/locate/issn/01678191;
URL = "http://www.elsevier.com/gej-ng/10/35/21/60/27/28/abstract.html;
acknowledgement = ack-nhfb,
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
author = "Yan Solihin and Jaejin Lee and Josep Torrellas",
title = "Using a user-level memory thread for correlation
journal = j-COMP-ARCH-NEWS,
volume = "30",
number = "2",
pages = "171--182",
month = may,
year = "2002",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:50 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Thomas L. Sterling and Hans P. Zima",
title = "{Gilgamesh}: a Multithreaded Processor-In-Memory
Architecture for Petaflops Computing",
crossref = "IEEE:2002:STI",
pages = "??--??",
year = "2002",
bibdate = "Wed Nov 26 07:34:20 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.sc-2002.org/paperpdfs/pap.pap105.pdf",
abstract = "Processor-in-Memory (PIM) architectures avoid the von
Neumann bottleneck in conventional machines by
integrating high-density DRAM and CMOS logic on the
same chip. Parallel systems based on this new
technology are expected to provide higher scalability,
adaptability, robustness, fault tolerance and lower
power consumption than current MPPs or commodity
clusters. In this paper we describe the design of
Gilgamesh, a PIM-based massively parallel architecture,
and elements of its execution model. Gilgamesh extends
existing PIM capabilities by incorporating advanced
mechanisms for virtualizing tasks and data and
providing adaptive resource management for load
balancing and latency tolerance. The Gilgamesh
execution model is based on macroservers, a middleware
layer which supports object-based runtime management of
data and threads allowing explicit and dynamic control
of locality and load balancing. The paper concludes
with a discussion of related research activities and an
outlook to future work.",
acknowledgement = ack-nhfb,
author = "Scott D. Stoller",
title = "Model-checking multi-threaded distributed {Java}
volume = "4",
number = "1",
pages = "71--91",
month = oct,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1007/s10009-002-0077-2",
ISSN = "1433-2779 (print), 1433-2787 (electronic)",
ISSN-L = "1433-2779",
bibdate = "Tue Nov 23 15:01:41 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "International Journal on Software Tools for Technology
Transfer: STTT",
author = "Minyoung Sung and Soyoung Kim and Sangsoo Park and
Naehyuck Chang and Heonshik Shin",
title = "Comparative performance evaluation of {Java} threads
for embedded applications: {Linux Thread} vs. {Green
journal = j-INFO-PROC-LETT,
volume = "84",
number = "4",
pages = "221--225",
day = "30",
month = nov,
year = "2002",
ISSN = "0020-0190 (print), 1872-6119 (electronic)",
ISSN-L = "0020-0190",
bibdate = "Mon Jan 26 08:44:30 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Information Processing Letters",
journal-URL = "http://www.sciencedirect.com/science/journal/00200190",
author = "Patrick Tennberg",
title = "Refactoring Global Objects in Multithreaded
journal = j-CCCUJ,
volume = "20",
number = "5",
pages = "20--??",
month = may,
year = "2002",
ISSN = "1075-2838",
bibdate = "Tue May 14 18:09:36 MDT 2002",
bibsource = "http://www.cuj.com/articles/2002/0205/0205toc.htm?topic=articles;
abstract = "Although you may get fired for introducing any new
global variables, it's too much work to rewrite old
code to remove them. So make them thread-safe and stop
acknowledgement = ack-nhfb,
fjournal = "C/C++ Users Journal",
author = "Kevin B. Theobald and Rishi Kumar and Gagan Agrawal
and Gerd Heber and Ruppa K. Thulasiram and Guang R.
title = "Implementation and evaluation of a communication
intensive application on the {EARTH} multithreaded
journal = j-CCPE,
volume = "14",
number = "3",
pages = "183--201",
month = mar,
year = "2002",
DOI = "https://doi.org/10.1002/cpe.604",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Sat May 18 14:54:00 MDT 2002",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
URL = "http://www3.interscience.wiley.com/cgi-bin/abstract/93513486/START;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
author = "Parimala Thulasiraman and Kevin Theobald and Ashfaq A.
Khokhar and Guang R. Gao",
title = "Efficient Multithreaded Algorithms for the {Fast
Fourier Transform}",
volume = "5",
number = "2",
pages = "239--258",
month = jun,
year = "2002",
CODEN = "????",
ISSN = "1097-2803",
bibdate = "Thu Sep 2 12:08:56 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "PDCP: Parallel and Distributed Computing Practices",
author = "Theo Ungerer and Borut Robi{\v{c}} and Jurij
title = "Multithreaded Processors",
journal = j-COMP-J,
volume = "45",
number = "3",
pages = "320--348",
month = "????",
year = "2002",
ISSN = "0010-4620 (print), 1460-2067 (electronic)",
ISSN-L = "0010-4620",
bibdate = "Fri May 10 10:12:07 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_03/450320.sgm.abs.html;
acknowledgement = ack-nhfb,
fjournal = "The Computer Journal",
journal-URL = "http://comjnl.oxfordjournals.org/",
author = "Theo Ungerer and Borut Robi{\v{c}} and Jurij
title = "A survey of processors with explicit multithreading",
journal = j-COMP-SURV,
volume = "35",
number = "1",
pages = "29--63",
month = mar,
year = "2002",
ISSN = "0360-0300 (print), 1557-7341 (electronic)",
ISSN-L = "0360-0300",
bibdate = "Thu Aug 7 06:57:01 MDT 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Computing Surveys",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J204",
author = "T. N. Vijaykumar and Irith Pomeranz and Karl Cheng",
title = "Transient-fault recovery using simultaneous
journal = j-COMP-ARCH-NEWS,
volume = "30",
number = "2",
pages = "87--98",
month = may,
year = "2002",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:50 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Hong Wang and Perry H. Wang and Ross Dave Weldon and
Scott M. Ettinger and Hideki Saito and Milind Girkar
and Steve Shih-wei Liao and John P. Shen",
title = "Speculative Precomputation: Exploring the Use of
Multithreading for Latency Tools",
journal = j-INTEL-TECH-J,
volume = "6",
number = "1",
pages = "22--35",
month = feb,
year = "2002",
ISSN = "1535-766X",
bibdate = "Thu Feb 28 15:24:21 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://developer.intel.com/technology/itj/2002/volume06issue01/vol6iss1_hyper_threading_technology.pdf",
author = "C. Yan",
title = "Race condition and concurrency safety of multithreaded
object-oriented programming in {Java}",
journal = "IEEE International Conference on Systems Man and
Cybernetics",
volume = "6",
pages = "??--??",
year = "2002",
CODEN = "????",
ISSN = "1062-922X",
bibdate = "Tue Apr 8 06:53:44 MDT 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
Ingenta database",
acknowledgement = ack-nhfb,
xxpages = "WA1Q3",
author = "Antonia Zhai and Christopher B. Colohan and J. Gregory
Steffan and Todd C. Mowry",
title = "Compiler optimization of scalar value communication
between speculative threads",
journal = j-COMP-ARCH-NEWS,
volume = "30",
number = "5",
pages = "171--183",
month = dec,
year = "2002",
ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:23 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Antonia Zhai and Christopher B. Colohan and J. Gregory
Steffan and Todd C. Mowry",
title = "Compiler optimization of scalar value communication
between speculative threads",
journal = j-SIGPLAN,
volume = "37",
number = "10",
pages = "171--183",
month = oct,
year = "2002",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Thu May 15 12:23:09 MDT 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "W. M. Zuberek",
title = "Analysis of Performance Bottlenecks in Multithreaded
Multiprocessor Systems",
journal = j-FUND-INFO,
volume = "50",
number = "2",
pages = "223--241",
month = feb,
year = "2002",
ISSN = "0169-2968 (print), 1875-8681 (electronic)",
ISSN-L = "0169-2968",
bibdate = "Sat Mar 5 16:59:23 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fundinfo2000.bib;
acknowledgement = ack-nhfb,
fjournal = "Fundamenta Informaticae",
journal-URL = "http://content.iospress.com/journals/fundamenta-informaticae",
author = "Tor M. Aamodt and Pedro Marcuello and Paul Chow and
Antonio Gonz{\'a}lez and Per Hammarlund and Hong Wang
and John P. Shen",
title = "A framework for modeling and optimization of prescient
instruction prefetch",
journal = j-SIGMETRICS,
volume = "31",
number = "1",
pages = "13--24",
month = jun,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1145/781027.781030",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Thu Jun 26 11:41:41 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper describes a framework for modeling
macroscopic program behavior and applies it to
optimizing prescient instruction prefetch --- a novel
technique that uses helper threads to improve
single-threaded application performance by performing
judicious and timely instruction prefetch. A helper
thread is initiated when the main thread encounters a
spawn point, and prefetches instructions starting at a
distant target point. The target identifies a code
region tending to incur I-cache misses that the main
thread is likely to execute soon, even though
intervening control flow may be unpredictable. The
optimization of spawn-target pair selections is
formulated by modeling program behavior as a Markov
chain based on profile statistics. Execution paths are
considered stochastic outcomes, and aspects of program
behavior are summarized via path expression mappings.
Mappings for computing reaching, and posteriori
probability; path length mean, and variance; and
expected path footprint are presented. These are used
with Tarjan's fast path algorithm to efficiently
estimate the benefit of spawn-target pair selections.
Using this framework we propose a spawn-target pair
selection algorithm for prescient instruction prefetch.
This algorithm has been implemented, and evaluated for
the Itanium Processor Family architecture. A limit
study finds 4.8\% to 17\% speedups on an in-order
simultaneous multithreading processor with eight
contexts, over nextline and streaming I-prefetch for a
set of benchmarks with high I-cache miss rates. The
framework in this paper is potentially applicable to
other thread speculation techniques.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
keywords = "analytical modeling; helper threads; instruction
prefetch; multithreading; optimization; path
author = "E. Abraham and F. S. deBoer and W. P. deRoever and M.
title = "A Tool-Supported Proof System for Multithreaded
journal = j-LECT-NOTES-COMP-SCI,
volume = "2852",
pages = "1--32",
year = "2003",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Nov 11 05:21:36 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
Ingenta database",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "C. Addison and Y. Ren and M. van Waveren",
title = "{OpenMP} issues arising in the development of parallel
{BLAS} and {LAPACK} libraries",
journal = j-SCI-PROG,
volume = "11",
number = "2",
pages = "95--104",
year = "2003",
ISSN = "1058-9244 (print), 1875-919X (electronic)",
ISSN-L = "1058-9244",
bibdate = "Mon Jan 12 06:28:15 MST 2004",
bibsource = "http://www.iospress.nl/site/html/10589244.html;
acknowledgement = ack-nhfb,
fjournal = "Scientific Programming",
journal-URL = "http://iospress.metapress.com/content/1058-9244",
author = "George Alm{\'a}si and C{\u{a}}lin Ca{\c{s}}caval and
Jos{\'e} G. Casta{\~n}os and Monty Denneau and Derek
Lieber and Jos{\'e} E. Moreira and Henry S. {Warren,
title = "Dissecting {Cyclops}: a detailed analysis of a
multithreaded architecture",
journal = j-COMP-ARCH-NEWS,
volume = "31",
number = "1",
pages = "26--38",
month = mar,
year = "2003",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:37 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Bartosz Bali{\'s} and Marian Bubak and W{\l}odzimierz
Funika and Roland Wism{\"u}ller",
title = "A monitoring system for multithreaded applications",
journal = j-FUT-GEN-COMP-SYS,
volume = "19",
number = "5",
pages = "641--650",
month = jul,
year = "2003",
ISSN = "0167-739X (print), 1872-7115 (electronic)",
ISSN-L = "0167-739X",
bibdate = "Sat Jan 10 10:03:34 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Future Generation Computer Systems",
journal-URL = "http://www.sciencedirect.com/science/journal/0167739X",
remark = "Tools for Program Development and Analysis. Best
papers from two Technical Sessions, at ICCS2001, San
Francisco, CA, USA, and ICCS2002, Amsterdam, The
author = "Vasileios K. Barekas and Panagiotis E. Hadjidoukas and
Eleftherios D. Polychronopoulos and others",
title = "A Multiprogramming Aware {OpenMP} Implementation",
journal = j-SCI-PROG,
volume = "11",
number = "2",
pages = "133--141",
year = "2003",
ISSN = "1058-9244 (print), 1875-919X (electronic)",
ISSN-L = "1058-9244",
bibdate = "Mon Jan 12 06:28:15 MST 2004",
bibsource = "http://www.iospress.nl/site/html/10589244.html;
acknowledgement = ack-nhfb,
fjournal = "Scientific Programming",
journal-URL = "http://iospress.metapress.com/content/1058-9244",
author = "Ron Brightwell and Rolf Riesen and Arthur B. Maccabe",
title = "Design, Implementation, and Performance of {MPI} on
{Portals 3.0}",
journal = j-IJHPCA,
volume = "17",
number = "1",
pages = "7--20",
month = "Spring",
year = "2003",
ISSN = "1094-3420 (print), 1741-2846 (electronic)",
ISSN-L = "1094-3420",
bibdate = "Fri Nov 28 06:52:13 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
journal-URL = "http://hpc.sagepub.com/content/by/year",
author = "Sergio Briguglio and Beniamino Di Martino and Gregorio
title = "A performance-prediction model for {PIC} applications
on clusters of Symmetric MultiProcessors: Validation
with hierarchical {HPF $+$ OpenMP} implementation",
journal = j-SCI-PROG,
volume = "11",
number = "2",
pages = "159--176",
year = "2003",
ISSN = "1058-9244 (print), 1875-919X (electronic)",
ISSN-L = "1058-9244",
bibdate = "Mon Jan 12 06:28:15 MST 2004",
bibsource = "http://www.iospress.nl/site/html/10589244.html;
acknowledgement = ack-nhfb,
fjournal = "Scientific Programming",
journal-URL = "http://iospress.metapress.com/content/1058-9244",
author = "Steve Carr and Jean Mayo and Ching-Kuang Shene",
title = "{ThreadMentor}: a pedagogical tool for multithreaded
journal = j-JERIC,
volume = "3",
number = "1",
pages = "1--30",
month = mar,
year = "2003",
CODEN = "????",
ISSN = "1531-4278",
bibdate = "Tue Feb 3 18:43:37 MST 2004",
bibsource = "http://www.acm.org/pubs/contents/journals/jeric/;
acknowledgement = ack-nhfb,
fjournal = "ACM Journal on Educational Resources in Computing
author = "A. Chakravarti and X. Wang and J. Hallstrom and G.
booktitle = "Proceedings of the International Conference on
Parallel Processing",
title = "Implementation of Strong Mobility for Multi-threaded
Agents in {Java}",
publisher = "????",
address = "????",
pages = "321--332",
year = "2003",
CODEN = "????",
ISSN = "0190-3918",
bibdate = "Tue Dec 2 18:51:43 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
Ingenta database",
acknowledgement = ack-nhfb,
author = "Peng-Sheng Chen and Ming-Yu Hung and Yuan-Shin Hwang
and Roy Dz-Ching Ju and Jenq Kuen Lee",
title = "Compiler support for speculative multithreading
architecture with probabilistic points-to analysis",
journal = j-SIGPLAN,
pages = "25--36",
year = "2003",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Dec 22 16:52:42 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Tamar Domani and Gal Goldshtein and Elliot K. Kolodner
and Ethan Lewis and Erez Petrank and Dafna Sheinwald",
title = "Thread-Local Heaps for {Java}",
journal = j-SIGPLAN,
volume = "38",
number = "2s",
pages = "183--194",
month = feb,
year = "2003",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Thu May 15 12:23:14 MDT 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
Ingenta database",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Orit Edelstein and Eitan Farchi and Evgeny Goldin and
Yarden Nir and Gil Ratsaby and Shmuel Ur",
title = "Framework for testing multi-threaded {Java} programs",
journal = j-CCPE,
volume = "15",
number = "3--5",
pages = "485--499",
month = mar # "\slash " # apr,
year = "2003",
DOI = "https://doi.org/10.1002/cpe.654",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Tue Jan 13 09:28:08 MST 2004",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "12 Feb 2003",
author = "Weijian Fang and Cho-Li Wang and Francis C. M. Lau",
title = "On the design of global object space for efficient
multi-threading {Java} computing on clusters",
volume = "29",
number = "11--12",
pages = "1563--1587",
month = nov # "\slash " # dec,
year = "2003",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Wed Dec 24 09:07:29 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
author = "E. Gagnon and L. Hendren",
title = "Effective Inline-Threaded Interpretation of {Java}
Bytecode Using Preparation Sequences",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2622",
pages = "170--184",
year = "2003",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Apr 15 07:54:18 MDT 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
Ingenta database",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Nicholas I. M. Gould and Dominique Orban and Philippe
L. Toint",
title = "{GALAHAD}, a library of thread-safe {Fortran 90}
packages for large-scale nonlinear optimization",
journal = j-TOMS,
volume = "29",
number = "4",
pages = "353--372",
month = dec,
year = "2003",
DOI = "https://doi.org/10.1145/962437.962438",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Mon Jan 5 17:18:49 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We describe the design of version 1.0 of GALAHAD, a
library of Fortran 90 packages for large-scale
nonlinear optimization. The library particularly
addresses quadratic programming problems, containing
both interior point and active set algorithms, as well
as tools for preprocessing problems prior to solution.
It also contains an updated version of the venerable
nonlinear programming package, LANCELOT.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
author = "Dan Grossman",
title = "Type-safe multithreading in cyclone",
journal = j-SIGPLAN,
volume = "38",
number = "3",
pages = "13--25",
month = mar,
year = "2003",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Thu May 15 12:23:16 MDT 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "C. Heinlein",
title = "Advanced Thread Synchronization in {Java} Using
Interaction Expressions",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2591",
pages = "345--365",
year = "2003",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Apr 1 06:09:06 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
Ingenta database",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Haoqiang Jin and Gabriele Jost and Jerry Yan and
title = "Automatic multilevel parallelization using {OpenMP}",
journal = j-SCI-PROG,
volume = "11",
number = "2",
pages = "177--190",
year = "2003",
ISSN = "1058-9244 (print), 1875-919X (electronic)",
ISSN-L = "1058-9244",
bibdate = "Mon Jan 12 06:28:15 MST 2004",
bibsource = "http://www.iospress.nl/site/html/10589244.html;
acknowledgement = ack-nhfb,
fjournal = "Scientific Programming",
journal-URL = "http://iospress.metapress.com/content/1058-9244",
author = "Yang-Suk Kee and Jin-Soo Kim and Soonhoi Ha",
title = "{ParADE}: An {OpenMP} Programming Environment for
{SMP} Cluster Systems",
crossref = "ACM:2003:SII",
pages = "??--??",
year = "2003",
bibdate = "Wed Nov 26 07:34:20 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#0;
abstract = "Demand for programming environments to exploit
clusters of symmetric multiprocessors (SMPs) is
increasing. In this paper, we present a new programming
environment, called ParADE, to enable easy, portable,
and high-performance programming on SMP clusters. It is
an OpenMP programming environment on top of a
multi-threaded software distributed shared memory
(SDSM) system with a variant of home-based lazy release
consistency protocol. To boost performance, the runtime
system provides explicit message-passing primitives to
make it a hybrid-programming environment. Collective
communication primitives are used for the
synchronization and work-sharing directives associated
with small data structures, lessening the
synchronization overhead and avoiding the implicit
barriers of work-sharing directives. The OpenMP
translator bridges the gap between the OpenMP
abstraction and the hybrid programming interfaces of
the runtime system. The experiments with several NAS
benchmarks and applications on a Linux-based cluster
show promising results that ParADE overcomes the
performance problem of the conventional SDSM-based
OpenMP environment.",
acknowledgement = ack-nhfb,
keywords = "hybrid programming; MPI; OpenMP; programming
environment; SMP cluster; software distributed shared
author = "Aaron W. Keen and Takashi Ishihara and Justin T. Maris
and Tiejun Li and Eugene F. Fodor and Ronald A.
title = "A comparison of concurrent programming and cooperative
journal = j-CCPE,
volume = "15",
number = "1",
pages = "27--53",
month = jan,
year = "2003",
DOI = "https://doi.org/10.1002/cpe.706",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Tue Jan 13 09:28:05 MST 2004",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "6 Jan 2003",
author = "Jeremy Kepner",
title = "A multi-threaded fast convolver for dynamically
parallel image filtering",
journal = j-J-PAR-DIST-COMP,
volume = "63",
number = "3",
pages = "360--372",
month = mar,
year = "2003",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Tue Dec 16 16:10:40 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Scott Alan Klasky and Stephane Ethier and Zhihong Lin
and Kevin Martins and Doug McCune and Ravi Samtaney",
title = "Grid-Based Parallel Data Streaming implemented for the
Gyrokinetic Toroidal Code",
crossref = "ACM:2003:SII",
pages = "??--??",
year = "2003",
bibdate = "Wed Nov 26 07:34:20 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#2;
abstract = "We have developed a threaded parallel data streaming
approach using Globus to transfer multi-terabyte
simulation data from a remote supercomputer to the
scientist's home analysis/visualization cluster, as the
simulation executes, with negligible overhead. Data
transfer experiments show that this concurrent data
transfer approach is more favorable compared with
writing to local disk and then transferring this data
to be post-processed. The present approach is conducive
to using the grid to pipeline the simulation with
post-processing and visualization. We have applied this
method to the Gyrokinetic Toroidal Code (GTC), a
3-dimensional particle-in-cell code used to study
micro-turbulence in magnetic confinement fusion from
first principles plasma theory.",
acknowledgement = ack-nhfb,
author = "Rainer Koster and Andrew P. Black and Jie Huang and
Jonathan Walpole and Calton Pu",
title = "Thread transparency in information flow middleware",
journal = j-SPE,
volume = "33",
number = "4",
pages = "321--349",
month = apr,
year = "2003",
DOI = "https://doi.org/10.1002/spe.510",
ISSN = "0038-0644 (print), 1097-024X (electronic)",
ISSN-L = "0038-0644",
bibdate = "Sat Nov 29 17:39:44 MST 2003",
bibsource = "http://www.interscience.wiley.com/jpages/0038-0644;
acknowledgement = ack-nhfb,
fjournal = "Software---Practice and Experience",
journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
onlinedate = "19 Feb 2003",
author = "David Koufaty and Deborah T. Marr",
title = "Hyperthreading Technology in the Netburst
journal = j-IEEE-MICRO,
volume = "23",
number = "2",
pages = "56--65",
month = mar # "\slash " # apr,
year = "2003",
DOI = "https://doi.org/10.1109/MM.2003.1196115",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Wed Apr 23 18:57:11 MDT 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://dlib.computer.org/mi/books/mi2003/pdf/m2056.pdf;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Dieter Kranzlm{\"u}ller and Peter Kacsuk and Jack
Dongarra and Jens Volkert",
title = "Recent Advances in Parallel Virtual Machine and
Message Passing Interface (Select papers from the
{EuroPVMMPI 2002 Conference})",
journal = j-IJHPCA,
volume = "17",
number = "1",
pages = "3--5",
month = "Spring",
year = "2003",
ISSN = "1094-3420 (print), 1741-2846 (electronic)",
ISSN-L = "1094-3420",
bibdate = "Fri Nov 28 06:52:13 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
journal-URL = "http://hpc.sagepub.com/content/by/year",
author = "J. Kreuzinger and U. Brinkschulte and M. Pfeffer and
S. Uhrig and T. Ungerer",
title = "Real-time event-handling and scheduling on a
multithreaded {Java} microcontroller",
volume = "27",
number = "1",
pages = "19--31",
year = "2003",
ISSN = "0141-9331 (print), 1872-9436 (electronic)",
ISSN-L = "0141-9331",
bibdate = "Tue Feb 18 07:16:21 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
Ingenta database",
acknowledgement = ack-nhfb,
fjournal = "Microprocessors and Microsystems",
author = "Yu-Kwong Kwok",
title = "On Exploiting Heterogeneity for Cluster Based Parallel
Multithreading Using Task Duplication",
volume = "25",
number = "1",
pages = "63--72",
month = may,
year = "2003",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Tue Dec 16 08:27:09 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://ipsapp009.kluweronline.com/content/getfile/5189/43/4/abstract.htm;
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Heiko Mantel and Andrei Sabelfeld",
title = "A unifying approach to the security of distributed and
multi-threaded programs",
journal = j-J-COMP-SECUR,
volume = "11",
number = "4",
pages = "615--676",
month = "????",
year = "2003",
DOI = "https://doi.org/10.3233/JCS-2003-11406",
ISSN = "0926-227X (print), 1875-8924 (electronic)",
ISSN-L = "0926-227X",
bibdate = "Tue May 24 06:22:14 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jcompsecur.bib;
acknowledgement = ack-nhfb,
fjournal = "Journal of Computer Security",
journal-URL = "http://content.iospress.com/journals/journal-of-computer-security",
author = "Ami Marowka",
title = "Extending {OpenMP} for Task Parallelism",
volume = "13",
number = "3",
pages = "341--??",
month = sep,
year = "2003",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
bibdate = "Sat Nov 6 18:06:31 MST 2004",
bibsource = "http://ejournals.wspc.com.sg/ppl/;
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "Timothy G. Mattson",
title = "How good is {OpenMP}",
journal = j-SCI-PROG,
volume = "11",
number = "2",
pages = "81--93",
year = "2003",
ISSN = "1058-9244 (print), 1875-919X (electronic)",
ISSN-L = "1058-9244",
bibdate = "Mon Jan 12 06:28:15 MST 2004",
bibsource = "http://www.iospress.nl/site/html/10589244.html;
acknowledgement = ack-nhfb,
fjournal = "Scientific Programming",
journal-URL = "http://iospress.metapress.com/content/1058-9244",
author = "Derek McAuley and Rolf Neugebauer",
title = "A case for virtual channel processors",
crossref = "ACM:2003:ATA",
pages = "237--242",
year = "2003",
DOI = "https://doi.org/10.1145/944747.944758",
bibdate = "Sat Oct 14 14:03:33 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Modern desktop and server computer systems use
multiple processors: general purpose CPU(s), graphic
processor (GPU), network processors (NP) on Network
Interface Cards (NICs), RAID controllers, and signal
processors on sound cards and modems. Some of these
processors traditionally have been special purpose
processors but there is a trend towards replacing some
of these with embedded general purpose processors. At
the same time main CPUs become more powerful; desktop
CPUs start featuring Simultaneous Multi-Threading
(SMT); and Symmetric Multi-Processing (SMP) systems are
widely used in server systems. However, the structure
of operating systems has not really changed to reflect
these trends --- different types of processors evolve
at different time scales (largely driven by market
forces) requiring significant changes to operating
systems kernels to reflect the appropriate tradeoffs. In
this position paper we propose to re-vitalise the old
idea of channel processors by encapsulating operating
system I/O subsystems in Virtual Channel Processors
(VCPs). VCPs perform I/O operations on behalf of an OS.
They provide similar development, performance, and
fault isolation as dedicated (embedded) I/O processors
do while offering the flexibility to split
functionality between the main processor(s) and
dedicated processors without affecting the rest of the
OS. If part of a VCP is executed on the main processor,
we propose to make use of virtual machine technology
and SMT/SMP features to isolate its performance from
that of the rest of the system and to protect the
system from faults within the VCP.",
acknowledgement = ack-nhfb,
author = "Luke K. McDowell and Susan J. Eggers and Steven D.
Gribble",
title = "Improving server software support for simultaneous
multithreaded processors",
journal = j-SIGPLAN,
pages = "37--48",
year = "2003",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Dec 22 16:52:42 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Matthias S. M{\"u}ller",
title = "An {OpenMP} compiler benchmark",
journal = j-SCI-PROG,
volume = "11",
number = "2",
pages = "125--131",
year = "2003",
ISSN = "1058-9244 (print), 1875-919X (electronic)",
ISSN-L = "1058-9244",
bibdate = "Mon Jan 12 06:28:15 MST 2004",
bibsource = "http://www.iospress.nl/site/html/10589244.html;
acknowledgement = ack-nhfb,
fjournal = "Scientific Programming",
journal-URL = "http://iospress.metapress.com/content/1058-9244",
author = "Kengo Nakajima",
title = "Parallel Iterative Solvers of {GeoFEM} with Selective
Blocking Preconditioning for Nonlinear Contact Problems
on the {Earth Simulator}",
crossref = "ACM:2003:SII",
pages = "??--??",
year = "2003",
bibdate = "Wed Nov 26 07:34:20 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10703#1;
abstract = "An efficient parallel iterative method with selective
blocking preconditioning has been developed for
symmetric multiprocessor (SMP) cluster architectures
with vector processors such as the Earth Simulator.
This method is based on a three-level hybrid parallel
programming model, which includes message passing for
inter-SMP node communication, loop directives by OpenMP
for intra-SMP node parallelization and vectorization
for each processing element (PE). This method provides
robust and smooth convergence and excellent vector and
parallel performance in 3D geophysical simulations with
contact conditions performed on the Earth Simulator.
The selective blocking preconditioning is much more
efficient than ILU(1) and ILU(2). Performance for the
complicated Southwest Japan model with more than 23 M
DOF on 10 SMP nodes (80 PEs) of the Earth Simulator was
161.7 GFLOPS, corresponding to 25.3\% of the peak
performance for hybrid programming model, and 190.4
GFLOPS (29.8\% of the peak performance) for flat MPI,
respectively.",
acknowledgement = ack-nhfb,
author = "James C. Pang and Gholamali C. Shoja and Eric G.
Manning",
title = "Providing soft real-time quality of service guarantees
for {Java} threads",
journal = j-CCPE,
volume = "15",
number = "3--5",
pages = "521--538",
month = mar # "\slash " # apr,
year = "2003",
DOI = "https://doi.org/10.1002/cpe.663",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Tue Jan 13 09:28:08 MST 2004",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "12 Feb 2003",
author = "Il Park and Babak Falsafi and T. N. Vijaykumar",
title = "Implicitly-multithreaded processors",
journal = j-COMP-ARCH-NEWS,
volume = "31",
number = "2",
pages = "39--51",
month = may,
year = "2003",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:51 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "C. Petitpierre",
title = "{Java} Threads Can Be Very Useful Building Blocks",
journal = j-LECT-NOTES-COMP-SCI,
volume = "2604",
pages = "204",
year = "2003",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Apr 1 06:09:06 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
Ingenta database",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Ruben Pinilla and Marisa Gil",
title = "{ULT}: a {Java} threads model for platform independent
execution",
journal = j-OPER-SYS-REV,
volume = "37",
number = "4",
pages = "48--62",
month = oct,
year = "2003",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:53 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Eli Pozniansky and Assaf Schuster",
title = "Efficient on-the-fly data race detection in
multithreaded {C++} programs",
journal = j-SIGPLAN,
pages = "179--190",
year = "2003",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Dec 22 16:52:42 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Manohar K. Prabhu and Kunle Olukotun",
title = "Using thread-level speculation to simplify manual
parallelization",
journal = j-SIGPLAN,
pages = "1--12",
year = "2003",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Dec 22 16:52:42 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Gregory W. Price and David K. Lowenthal",
title = "A comparative analysis of fine-grain threads
packages",
journal = j-J-PAR-DIST-COMP,
volume = "63",
number = "11",
pages = "1050--1063",
month = nov,
year = "2003",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Tue Dec 16 16:10:44 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Milos Prvulovic and Josep Torrellas",
title = "{ReEnact}: using thread-level speculation mechanisms
to debug data races in multithreaded codes",
journal = j-COMP-ARCH-NEWS,
volume = "31",
number = "2",
pages = "110--121",
month = may,
year = "2003",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:51 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Ravi Rajwar and James Goodman",
title = "Transactional Execution: Toward Reliable,
High-Performance Multithreading",
journal = j-IEEE-MICRO,
volume = "23",
number = "6",
pages = "117--125",
month = nov # "\slash " # dec,
year = "2003",
DOI = "https://doi.org/10.1109/MM.2003.1261395",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Sat Jan 31 07:23:55 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
URL = "http://csdl.computer.org/comp/mags/mi/2003/06/m6117abs.htm;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Kay A. Robbins and Steven Robbins",
title = "{UNIX} Systems programming: communication,
concurrency, and threads",
publisher = pub-PHPTR,
address = pub-PHPTR:adr,
edition = "Second",
pages = "xvii + 893",
year = "2003",
ISBN = "0-13-042411-0",
ISBN-13 = "978-0-13-042411-2",
LCCN = "QA76.76.O63 R6215 2003",
bibdate = "Wed Aug 20 21:08:15 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
annote = "See \cite{Robbins:1996:PUP} for first edition.",
keywords = "operating systems (computers); UNIX (computer file)",
author = "Arch D. Robison",
title = "Memory Consistency and {.NET}",
journal = j-DDJ,
volume = "28",
number = "4",
pages = "46, 48--50",
month = apr,
year = "2003",
ISSN = "1044-789X",
bibdate = "Thu Jun 12 05:46:22 MDT 2003",
bibsource = "http://www.ddj.com/articles/2003/0304/;
URL = "http://www.ddj.com/documents/s=7827/ddj0304e/",
abstract = "Understanding the basics of memory consistency is
essential to writing multithreaded code that works on
both uniprocessors and multiprocessors.",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Yan Solihin and Jaejin Lee and Josep Torrellas",
title = "Correlation Prefetching with a User-Level Memory
Thread",
volume = "14",
number = "6",
pages = "563--580",
month = jun,
year = "2003",
DOI = "https://doi.org/10.1109/TPDS.2003.1206504",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Wed Dec 24 10:02:07 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://csdl.computer.org/comp/trans/td/2003/06/l0563abs.htm;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
Systems",
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Steven Swanson and Luke K. McDowell and Michael M.
Swift and Susan J. Eggers and Henry M. Levy",
title = "An evaluation of speculative instruction execution on
simultaneous multithreaded processors",
journal = j-TOCS,
volume = "21",
number = "3",
pages = "314--340",
month = aug,
year = "2003",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Thu Aug 7 10:13:26 MDT 2003",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
author = "Ruppa K. Thulasiram and Parimala Thulasiraman",
title = "Performance Evaluation of a Multithreaded {Fast
Fourier Transform} Algorithm for Derivative Pricing",
volume = "26",
number = "1",
pages = "43--58",
month = aug,
year = "2003",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Tue Dec 16 08:27:10 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://ipsapp009.kluweronline.com/content/getfile/5189/46/4/abstract.htm;
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Martin Timmerman",
title = "Examining {Windows CE .NET}",
journal = j-DDJ,
volume = "28",
number = "2",
pages = "62, 64",
month = feb,
year = "2003",
ISSN = "1044-789X",
bibdate = "Thu Jun 12 05:46:21 MDT 2003",
bibsource = "http://www.ddj.com/articles/2003/0302/;
URL = "http://www.ddj.com/documents/s=7790/ddj0302h/",
abstract = "Martin examines Windows CE .NET's thread handling and
advanced interrupt handling capabilities, as well as
its synchronization mechanisms and network stack
performance.",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "G. Tremblay and C. J. Morrone and J. N. Amaral and G.
R. Gao",
title = "Implementation of the {EARTH} programming model on
{SMP} clusters: a multi-threaded language and runtime
system",
journal = j-CCPE,
volume = "15",
number = "9",
pages = "821--844",
day = "10",
month = aug,
year = "2003",
DOI = "https://doi.org/10.1002/cpe.729",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Tue Jan 13 09:28:12 MST 2004",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "14 Jul 2003",
author = "Y. Tseng and R. F. DeMara and P. J. Wilder",
title = "Distributed-sum termination detection supporting
multithreaded execution",
volume = "29",
number = "7",
pages = "953--968",
month = jul,
year = "2003",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Wed Dec 24 09:07:26 MST 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
author = "Theo Ungerer and Borut Robi{\v{c}} and Jurij
{\v{S}}ilc",
title = "A survey of processors with explicit multithreading",
journal = j-COMP-SURV,
volume = "35",
number = "1",
pages = "29--63",
month = mar,
year = "2003",
DOI = "https://doi.org/10.1145/641865.641867",
ISSN = "0360-0300 (print), 1557-7341 (electronic)",
ISSN-L = "0360-0300",
bibdate = "Thu Jun 19 10:18:52 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/surveys/;
abstract = "Hardware multithreading is becoming a generally
applied technique in the next generation of
microprocessors. Several multithreaded processors are
announced by industry or already into production in the
areas of high-performance microprocessors, media, and
network processors. A multithreaded processor is able
to pursue two or more threads of control in parallel
within the processor pipeline. The contexts of two or
more threads of control are often stored in separate
on-chip register sets. Unused instruction slots, which
arise from latencies during the pipelined execution of
single-threaded programs by a contemporary
microprocessor, are filled by instructions of other
threads within a multithreaded processor. The execution
units are multiplexed between the thread contexts that
are loaded in the register sets. Underutilization of a
superscalar processor due to missing instruction-level
parallelism can be overcome by simultaneous
multithreading, where a processor can issue multiple
instructions from multiple threads each cycle.
Simultaneous multithreaded processors combine the
multithreading technique with a wide-issue superscalar
processor to utilize a larger part of the issue
bandwidth by issuing instructions from different
threads simultaneously. Explicit multithreaded
processors are multithreaded processors that apply
processes or operating system threads in their hardware
thread slots. These processors optimize the throughput
of multiprogramming workloads rather than single-thread
performance. We distinguish these processors from
implicit multithreaded processors that utilize
thread-level speculation by speculatively executing
compiler- or machine-generated threads of control that
are part of a single sequential program. This survey
paper explains and classifies the explicit
multithreading techniques in research and in commercial
microprocessors.",
acknowledgement = ack-nhfb,
fjournal = "ACM Computing Surveys",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J204",
keywords = "Blocked multithreading; interleaved multithreading;
simultaneous multithreading",
author = "Christoph von Praun and Thomas R. Gross",
title = "Static conflict analysis for multi-threaded
object-oriented programs",
journal = j-SIGPLAN,
volume = "38",
number = "5",
pages = "115--128",
month = may,
year = "2003",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sat Oct 11 12:45:00 MDT 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Panit Watcharawitch and Simon W. Moore",
title = "{MulTEP}: {MulTithreaded Embedded Processors}",
crossref = "Anonymous:2003:CCV",
pages = "??--??",
year = "2003",
bibdate = "Fri Jan 09 17:02:42 2004",
bibsource = "http://www.coolchips.org/cool6/pdfDocuments/WEB05-Program_COOL6_2003.4.1.pdf;
acknowledgement = ack-nhfb,
author = "Tom White",
title = "Using Thread-Local Variables In {Java}",
journal = j-DDJ,
volume = "28",
number = "7",
pages = "42, 44--46",
month = jul,
year = "2003",
ISSN = "1044-789X",
bibdate = "Thu Jun 12 05:46:24 MDT 2003",
bibsource = "http://www.ddj.com/articles/2003/0307/;
URL = "http://www.ddj.com/ftp/2003/2003_07/thread.txt;
abstract = "Java's ThreadLocal class provides a powerful,
easy-to-use way to write efficient code that is safe
for multithreaded access. Additional resources include
thread.txt (listings) and thread.zip (source code).",
acknowledgement = ack-nhfb,
fjournal = "Dr. Dobb's Journal of Software Tools",
author = "Xie Yong and Hsu Wen-Jing",
title = "Aligned Multithreaded Computations and Their
Scheduling with {FAB} Performance Guarantees",
volume = "13",
number = "3",
pages = "353--??",
month = sep,
year = "2003",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
bibdate = "Thu Jan 06 09:41:03 2005",
bibsource = "http://ejournals.wspc.com.sg/ppl/;
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "Anasua Bhowmik and Manoj Franklin",
title = "A General Compiler Framework for Speculative
Multithreaded Processors",
volume = "15",
number = "8",
pages = "713--724",
month = aug,
year = "2004",
DOI = "https://doi.org/10.1109/TPDS.2004.26",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Sat Dec 11 16:24:15 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://csdl.computer.org/dl/trans/td/2004/08/l0713.htm;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
Systems",
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "S. Bouchenak and D. Hagimont and S. Krakowiak and N.
De Palma and F. Boyer",
title = "Experiences implementing efficient {Java} thread
serialization, mobility and persistence",
journal = j-SPE,
volume = "34",
number = "4",
pages = "355--393",
day = "10",
month = apr,
year = "2004",
DOI = "https://doi.org/10.1002/spe.569",
ISSN = "0038-0644 (print), 1097-024X (electronic)",
ISSN-L = "0038-0644",
bibdate = "Sat Apr 16 07:26:28 MDT 2005",
bibsource = "http://www.interscience.wiley.com/jpages/0038-0644;
acknowledgement = ack-nhfb,
fjournal = "Software---Practice and Experience",
journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
onlinedate = "5 Jan 2004",
author = "H. M. Bucker and B. Lang and H. J. Pflug and A.
Vehreschild",
title = "Threads in an Undergraduate Course: a {Java} Example
Illuminating Different Multithreading Approaches",
journal = j-LECT-NOTES-COMP-SCI,
volume = "3044",
pages = "882--891",
year = "2004",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Sep 28 15:27:39 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
Ingenta database",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "B. M. Chang and J. D. Choi",
title = "Thread-Sensitive Points-to Analysis for Multithreaded
{Java} Programs",
journal = j-LECT-NOTES-COMP-SCI,
volume = "3280",
pages = "945--954",
year = "2004",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Mon Dec 6 06:44:22 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
Ingenta database",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Mainak Chaudhuri and Mark Heinrich",
title = "{SMTp}: {An Architecture} for {Next-generation
Scalable Multi-threading}",
journal = j-COMP-ARCH-NEWS,
volume = "32",
number = "2",
pages = "124--124",
month = mar,
year = "2004",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:45 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Cormac Flanagan and Stephen N. Freund",
title = "Atomizer: a dynamic atomicity checker for
multithreaded programs",
journal = j-SIGPLAN,
volume = "39",
number = "1",
pages = "256--267",
month = jan,
year = "2004",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Apr 12 09:38:12 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Cormac Flanagan and Stephen N. Freund and Shaz
Qadeer",
title = "Exploiting purity for atomicity",
journal = j-SIGSOFT,
volume = "29",
number = "4",
pages = "221--231",
month = jul,
year = "2004",
DOI = "https://doi.org/10.1145/1013886.1007543",
ISSN = "0163-5948 (print), 1943-5843 (electronic)",
ISSN-L = "0163-5948",
bibdate = "Wed Aug 1 17:14:35 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "The notion that certain procedures are atomic is a
fundamental correctness property of many multithreaded
software systems. A procedure is atomic if for every
execution there is an equivalent serial execution in
which the actions performed by any thread while
executing the atomic procedure are not interleaved with
actions of other threads. Several existing tools verify
atomicity by using commutativity of actions to show
that every execution reduces to a corresponding serial
execution. However, experiments with these tools have
highlighted a number of interesting procedures that,
while intuitively atomic, are not reducible. In this
paper, we exploit the notion of pure code blocks to
verify the atomicity of such irreducible procedures. If
a pure block terminates normally, then its evaluation
does not change the program state, and hence these
evaluation steps can be removed from the program trace
before reduction. We develop a static analysis for
atomicity based on this insight, and we illustrate this
analysis on a number of interesting examples that could
not be verified using earlier tools based purely on
reduction. The techniques developed in this paper may
also be applicable in other approaches for verifying
atomicity, such as model checking and dynamic
analysis.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGSOFT Software Engineering Notes",
journal-URL = "https://dl.acm.org/citation.cfm?id=J728",
author = "A. Georges and M. Christiaens and M. Ronsse and K. {De
Bosschere}",
title = "{JaRec}: a portable record\slash replay environment
for multi-threaded {Java} applications",
journal = j-SPE,
volume = "34",
number = "6",
pages = "523--547",
month = may,
year = "2004",
DOI = "https://doi.org/10.1002/spe.579",
ISSN = "0038-0644 (print), 1097-024X (electronic)",
ISSN-L = "0038-0644",
bibdate = "Sat Apr 16 07:26:29 MDT 2005",
bibsource = "http://www.interscience.wiley.com/jpages/0038-0644;
acknowledgement = ack-nhfb,
fjournal = "Software---Practice and Experience",
journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
onlinedate = "24 Feb 2004",
author = "Troy A. Johnson and Rudolf Eigenmann and T. N.
Vijaykumar",
title = "Min-cut program decomposition for thread-level
speculation",
journal = j-SIGPLAN,
volume = "39",
number = "6",
pages = "59--70",
month = may,
year = "2004",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Thu Dec 2 05:49:55 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Wesley M. Johnston and J. R. Paul Hanna and Richard J.
Millar",
title = "Advances in dataflow programming languages",
journal = j-COMP-SURV,
volume = "36",
number = "1",
pages = "1--34",
month = mar,
year = "2004",
DOI = "https://doi.org/10.1145/1013208.1013209",
ISSN = "0360-0300 (print), 1557-7341 (electronic)",
ISSN-L = "0360-0300",
bibdate = "Thu Jun 19 10:19:47 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/surveys/;
abstract = "Many developments have taken place within dataflow
programming languages in the past decade. In
particular, there has been a great deal of activity and
advancement in the field of dataflow visual programming
languages. The motivation for this article is to review
the content of these recent developments and how they
came about. It is supported by an initial review of
dataflow programming in the 1970s and 1980s that led to
current topics of research. It then discusses how
dataflow programming evolved toward a hybrid von
Neumann dataflow formulation, and adopted a more
coarse-grained approach. Recent trends toward dataflow
visual programming languages are then discussed with
reference to key graphical dataflow languages and their
development environments. Finally, the article details
four key open topics in dataflow programming
languages.",
acknowledgement = ack-nhfb,
fjournal = "ACM Computing Surveys",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J204",
keywords = "co-ordination languages; component software; data flow
visual programming; Dataflow; graphical programming;
multithreading; software engineering",
author = "Ron Kalla and Balaram Sinharoy and Joel M. Tendler",
title = "{IBM Power5} Chip: a Dual-Core Multithreaded
Processor",
journal = j-IEEE-MICRO,
volume = "24",
number = "2",
pages = "40--47",
month = mar # "\slash " # apr,
year = "2004",
DOI = "https://doi.org/10.1109/MM.2004.1289290",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Sat Dec 11 17:59:16 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://csdl.computer.org/comp/mags/mi/2004/02/m2040abs.htm;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Sanjiv Kapil and Harlan McGhan and Jesse Lawrendra",
title = "A Chip Multithreaded Processor for Network-Facing
Workloads",
journal = j-IEEE-MICRO,
volume = "24",
number = "2",
pages = "20--30",
month = mar # "\slash " # apr,
year = "2004",
DOI = "https://doi.org/10.1109/MM.2004.1289288",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Sat Dec 11 17:59:16 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://csdl.computer.org/comp/mags/mi/2004/02/m2020abs.htm;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Yang-Suk Kee and Jin-Soo Kim and Soonhoi Ha",
title = "Memory management for multi-threaded software {DSM}
systems",
volume = "30",
number = "1",
pages = "121--138",
month = jan,
year = "2004",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Sun Nov 7 05:53:52 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
author = "Ronny Krashinsky and Christopher Batten and Mark
Hampton and Steve Gerding and Brian Pharris and Jared
Casper and Krste Asanovic",
title = "The Vector-Thread Architecture",
journal = j-COMP-ARCH-NEWS,
volume = "32",
number = "2",
pages = "52--52",
month = mar,
year = "2004",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:45 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Ronny Krashinsky and Christopher Batten and Mark
Hampton and Steve Gerding and Brian Pharris and Jared
Casper and Krste Asanovic",
title = "The Vector-Thread Architecture",
journal = j-IEEE-MICRO,
volume = "24",
number = "6",
pages = "84--90",
month = nov # "\slash " # dec,
year = "2004",
DOI = "https://doi.org/10.1109/MM.2004.90",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Wed Apr 20 08:11:28 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://csdl.computer.org/dl/mags/mi/2004/06/m6084.htm;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Nagendra J. Kumar and Siddhartha Shivshankar and
Alexander G. Dean",
title = "Asynchronous software thread integration for efficient
software",
journal = j-SIGPLAN,
volume = "39",
number = "7",
pages = "37--46",
month = jul,
year = "2004",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Thu Dec 2 05:49:55 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Rakesh Kumar and Dean M. Tullsen and Parthasarathy
Ranganathan and Norman P. Jouppi and Keith I. Farkas",
title = "Single-{ISA} Heterogeneous Multi-Core Architectures
for Multithreaded Workload Performance",
journal = j-COMP-ARCH-NEWS,
volume = "32",
number = "2",
pages = "64--64",
month = mar,
year = "2004",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:45 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Oliver Lemon and Alexander Gruenstein",
title = "Multithreaded context for robust conversational
interfaces: {Context-sensitive} speech recognition and
interpretation of corrective fragments",
journal = j-TOCHI,
volume = "11",
number = "3",
pages = "241--267",
month = sep,
year = "2004",
ISSN = "1073-0516 (print), 1557-7325 (electronic)",
ISSN-L = "1073-0516",
bibdate = "Thu Nov 4 08:26:36 MST 2004",
bibsource = "http://www.acm.org/pubs/contents/journals/tochi/;
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Computer-Human Interaction",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J756",
author = "S. Q. Li and H. Y. Chen and Y. X. Su",
title = "A Framework of Reachability Testing for {Java}
Multithread Programs",
journal = "IEEE International Conference on Systems Man and
Cybernetics",
volume = "3",
pages = "2730--2734",
year = "2004",
CODEN = "????",
ISSN = "1062-922X",
bibdate = "Thu Mar 24 17:43:34 MST 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
Ingenta database",
acknowledgement = ack-nhfb,
author = "Justin T. Maris and Aaron W. Keen and Takashi Ishihara
and Ronald A. Olsson",
title = "A comparison of concurrent programming and cooperative
multithreading under load balancing applications",
journal = j-CCPE,
volume = "16",
number = "4",
pages = "345--369",
day = "10",
month = apr,
year = "2004",
DOI = "https://doi.org/10.1002/cpe.751",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Sat May 14 11:30:53 MDT 2005",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "19 Jan 2004",
author = "Ami Marowka and Zhenying Liu and Barbara Chapman",
title = "{OpenMP}-oriented applications for distributed shared
memory architectures",
journal = j-CCPE,
volume = "16",
number = "4",
pages = "371--384",
day = "10",
month = apr,
year = "2004",
DOI = "https://doi.org/10.1002/cpe.752",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Sat May 14 11:30:53 MDT 2005",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "19 Jan 2004",
author = "Mar{\'\i}a J. Mart{\'\i}n and Marta Parada and
Ram{\'o}n Doallo",
title = "High Performance Air Pollution Simulation Using
volume = "28",
number = "3",
pages = "311--321",
month = jun,
year = "2004",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Sat Dec 4 12:39:13 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://ipsapp008.kluweronline.com/IPS/content/ext/x/J/5189/I/54/A/5/abstract.htm",
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Maged M. Michael",
title = "Scalable lock-free dynamic memory allocation",
journal = j-SIGPLAN,
volume = "39",
number = "6",
pages = "35--46",
month = may,
year = "2004",
DOI = "https://doi.org/10.1145/996841.996848",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Thu Dec 2 05:49:55 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Dynamic memory allocators (malloc/free) rely on mutual
exclusion locks for protecting the consistency of their
shared data structures under multithreading. The use of
locking has many disadvantages with respect to
performance, availability, robustness, and programming
flexibility. A lock-free memory allocator guarantees
progress regardless of whether some threads are delayed
or even killed and regardless of scheduling policies.
This paper presents a completely lock-free memory
allocator. It uses only widely-available operating
system support and hardware atomic instructions. It
offers guaranteed availability even under arbitrary
thread termination and crash-failure, and it is immune
to deadlock regardless of scheduling policies, and
hence it can be used even in interrupt handlers and
real-time applications without requiring special
scheduler support. Also, by leveraging some high-level
structures from Hoard, our allocator is highly
scalable, limits space blowup to a constant factor, and
is capable of avoiding false sharing. In addition, our
allocator allows finer concurrency and much lower
latency than Hoard. We use PowerPC shared memory
multiprocessor systems to compare the performance of
our allocator with the default AIX 5.1 libc malloc, and
two widely-used multithread allocators, Hoard and
Ptmalloc. Our allocator outperforms the other
allocators in virtually all cases and often by
substantial margins, under various levels of
parallelism and allocation patterns. Furthermore, our
allocator also offers the lowest contention-free
latency among the allocators by significant margins.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "M. Omma",
title = "On building multithreaded applications",
volume = "5",
number = "4",
pages = "1--3",
month = apr,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/MDSO.2004.1301256",
ISSN = "1541-4922 (print), 1558-1683 (electronic)",
ISSN-L = "1541-4922",
bibdate = "Fri Jul 15 17:50:15 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://ieeexplore.ieee.org/iel5/8968/28913/01301256.pdf?isnumber=28913&prod=JNL&arnumber=1301256&arSt=+1&ared=+3&arAuthor=Omma%2C+M.;
acknowledgement = ack-nhfb,
fjournal = "IEEE Distributed Systems Online",
author = "M. Pfeffer and T. Ungerer and S. Fuhrmann and J.
Kreuzinger and U. Brinkschulte",
title = "Real-Time Garbage Collection for a Multithreaded
{Java} Microcontroller",
journal = j-REAL-TIME-SYST,
volume = "26",
number = "1",
pages = "89--106",
year = "2004",
ISSN = "0922-6443",
ISSN-L = "0922-6443",
bibdate = "Mon Jan 5 17:25:38 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
Ingenta database",
acknowledgement = ack-nhfb,
fjournal = "Real-Time Systems",
author = "B. Robatmili and N. Yazdani and S. Sardashti and M.
title = "Thread-Sensitive Instruction Issue for {SMT}
volume = "3",
number = "1",
pages = "5--5",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Simultaneous Multi Threading (SMT) is a processor
design method in which concurrent hardware threads
share processor resources like functional units and
memory. The scheduling complexity and performance of an
SMT processor depend on the topology used in the fetch
and issue stages. In this paper, we propose a thread
sensitive issue policy for a partitioned SMT processor
which is based on a thread metric. We propose the
number of ready-to-issue instructions of each thread as
priority metric. To evaluate our method, we have
developed a reconfigurable SMT-simulator on top of the
SimpleScalar Toolset. We simulated our modeled
processor under several workloads composed of SPEC
benchmarks. Experimental results show around 30\%
improvement compared to the conventional OLDEST\_FIRST
mixed topology issue policy. Additionally, the hardware
implementation of our architecture with this metric in
issue stage is quite simple.",
acknowledgement = ack-nhfb,
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Clocks; Delay; Frequency; Intrusion detection;
Laboratories; Logic; Processor scheduling;
Surface-mount technology; Topology",
author = "Marcus Roth and Gerrit Voss and Dirk Reiners",
title = "Multi-threading and clustering for scene graph
volume = "28",
number = "1",
pages = "63--66",
month = feb,
year = "2004",
ISSN = "0097-8493 (print), 1873-7684 (electronic)",
ISSN-L = "0097-8493",
bibdate = "Tue Jan 27 12:04:28 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Computers and Graphics",
journal-URL = "http://www.sciencedirect.com/science/journal/00978493",
author = "B. Sanden",
title = "Coping with {Java} Threads: {Java} works for many
kinds of concurrent software, but it was not designed
for safety-critical real-time applications and does not
protect the programmer from the pitfalls associated
with multithreading",
journal = j-COMPUTER,
volume = "37",
number = "4",
pages = "20--27",
year = "2004",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Mon May 17 14:50:36 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
Ingenta database",
acknowledgement = ack-nhfb,
fjournal = "Computer",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
author = "Chulho Shin and Seong-Won Lee and Jean-Luc Gaudiot",
title = "The Need for Adaptive Dynamic Thread Scheduling in
Simultaneous Multithreading",
volume = "14",
number = "3/4",
pages = "327--??",
month = sep # "\slash " # dec,
year = "2004",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
bibdate = "Thu Jul 7 07:41:25 MDT 2005",
bibsource = "http://ejournals.wspc.com.sg/ppl/;
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "Parimala Thulasiraman and Ashfaq A. Khokhar and Gerd
Heber and Guang R. Gao",
title = "A fine-grain load-adaptive algorithm of the {$2$D}
discrete wavelet transform for multithreaded
journal = j-J-PAR-DIST-COMP,
volume = "64",
number = "1",
pages = "68--78",
month = jan,
year = "2004",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Sat Dec 4 15:15:08 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Andrew Tolmach and Sergio Antoy and Marius Nita",
title = "Implementing functional logic languages using multiple
threads and stores",
journal = j-SIGPLAN,
volume = "39",
number = "9",
pages = "90--102",
month = sep,
year = "2004",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Thu Dec 2 05:49:56 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "A. Vrenios",
title = "{Parallel Programming in C with MPI and OpenMP} [Book
volume = "5",
number = "1",
pages = "7.1--7.3",
month = "????",
year = "2004",
CODEN = "????",
ISSN = "1541-4922 (print), 1558-1683 (electronic)",
ISSN-L = "1541-4922",
bibdate = "Fri Jul 15 17:50:13 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://ieeexplore.ieee.org/iel5/8968/28452/01270716.pdf?isnumber=28452&prod=JNL&arnumber=1270716&arSt=+7.1&ared=+7.3&arAuthor=Vrenios%2C+A.;
acknowledgement = ack-nhfb,
fjournal = "IEEE Distributed Systems Online",
author = "Perry H. Wang and Jamison D. Collins and Hong Wang and
Dongkeun Kim and Bill Greene and Kai-Ming Chan and
Aamir B. Yunus and Terry Sych and Stephen F. Moore and
John P. Shen",
title = "Helper threads via virtual multithreading on an
experimental {Itanium-2} processor-based platform",
journal = j-COMP-ARCH-NEWS,
volume = "32",
number = "5",
pages = "144--155",
month = dec,
year = "2004",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:24 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Perry H. Wang and Jamison D. Collins and Hong Wang and
Dongkeun Kim and Bill Greene and Kai-Ming Chan and
Aamir B. Yunus and Terry Sych and Stephen F. Moore and
John P. Shen",
title = "Helper threads via virtual multithreading on an
experimental {Itanium-2} processor-based platform",
journal = j-SIGPLAN,
volume = "39",
number = "11",
pages = "144--155",
month = nov,
year = "2004",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Apr 12 09:38:13 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Perry H. Wang and Jamison D. Collins and Hong Wang and
Dongkeun Kim and Bill Greene and Kai-Ming Chan and
Aamir B. Yunus and Terry Sych and Stephen F. Moore and
John P. Shen",
title = "Helper threads via virtual multithreading on an
experimental {Itanium-2} processor-based platform",
journal = j-OPER-SYS-REV,
volume = "38",
number = "5",
pages = "144--155",
month = dec,
year = "2004",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:56 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
author = "Perry H. Wang and Jamison D. Collins and Hong Wang and
Dongkeun Kim and Bill Greene and Kai-Ming Chan and
Aamir B. Yunus and Terry Sych and Stephen F. Moore and
John P. Shen",
title = "Helper Threads via Virtual Multithreading",
journal = j-IEEE-MICRO,
volume = "24",
number = "6",
pages = "74--82",
month = nov # "\slash " # dec,
year = "2004",
DOI = "https://doi.org/10.1109/MM.2004.75",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Wed Apr 20 08:11:28 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://csdl.computer.org/dl/mags/mi/2004/06/m6074.htm;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Xiaotong Zhuang and Santosh Pande",
title = "Balancing register allocation across threads for a
multithreaded network processor",
journal = j-SIGPLAN,
volume = "39",
number = "6",
pages = "289--300",
month = may,
year = "2004",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Thu Dec 2 05:49:55 MST 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Erika {\'A}brah{\'a}m and Frank S. de Boer and
Willem-Paul de Roever and Martin Steffen",
title = "An assertion-based proof system for multithreaded
journal = j-THEOR-COMP-SCI,
volume = "331",
number = "2--3",
pages = "251--290",
day = "25",
month = feb,
year = "2005",
ISSN = "0304-3975 (print), 1879-2294 (electronic)",
ISSN-L = "0304-3975",
bibdate = "Fri Jul 8 14:05:15 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Theoretical Computer Science",
journal-URL = "http://www.sciencedirect.com/science/journal/03043975",
author = "Anonymous",
title = "Errata: {{\em Characterization of Simultaneous
Multithreading (SMT) Efficiency in POWER5}}",
journal = j-IBM-JRD,
volume = "49",
number = "6",
pages = "1003--??",
month = nov,
year = "2005",
ISSN = "0018-8646 (print), 2151-8556 (electronic)",
ISSN-L = "0018-8646",
bibdate = "Fri Feb 9 21:39:23 MST 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "See \cite{Mathis:2005:CSM}.",
URL = "http://www.research.ibm.com/journal/rd/496/errata.html",
acknowledgement = ack-nhfb,
fjournal = "IBM Journal of Research and Development",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
ordernumber = "G322-0245-00",
author = "Katherine Barabash and Ori Ben-Yitzhak and Irit Goft
and Elliot K. Kolodner and Victor Leikehman and Yoav
Ossia and Avi Owshanko and Erez Petrank",
title = "A parallel, incremental, mostly concurrent garbage
collector for servers",
journal = j-TOPLAS,
volume = "27",
number = "6",
pages = "1097--1146",
month = nov,
year = "2005",
DOI = "https://doi.org/10.1145/1108970.1108972",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Wed Jan 11 05:23:15 MST 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
abstract = "Multithreaded applications with multigigabyte heaps
running on modern servers provide new challenges for
garbage collection (GC). The challenges for
``server-oriented'' GC include: ensuring short pause
times on a multigigabyte heap while minimizing
throughput penalty, good scaling on multiprocessor
hardware, and keeping the number of expensive
multicycle fence instructions required by weak ordering
to a minimum. We designed and implemented a collector
facing these demands building on the mostly concurrent
garbage collector proposed by Boehm et al. [1991]. Our
collector incorporates new ideas into the original
collector. We make it parallel and incremental; we
employ concurrent low-priority background GC threads to
take advantage of processor idle time; we propose novel
algorithmic improvements to the basic mostly concurrent
algorithm improving its efficiency and shortening its
pause times; and finally, we use advanced techniques,
such as a low-overhead work packet mechanism to enable
full parallelism among the incremental and concurrent
collecting threads and ensure load balancing. We
compared the new collector to the mature,
well-optimized, parallel, stop-the-world mark-sweep
collector already in the IBM JVM. When allowed to run
aggressively, using 72\% of the CPU utilization during
a short concurrent phase, our collector prototype
reduces the maximum pause time from 161 ms to 46 ms
while only losing 11.5\% throughput when running the
SPECjbb2000 benchmark on a 600-MB heap on an 8-way
PowerPC 1.1-GHz processors. When the collector is
limited to a nonintrusive operation using only 29\% of
the CPU utilization, the maximum pause time obtained is
79 ms and the loss in throughput is 15.4\%.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Programming Languages and
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Ramzi Basharahil and Brian Wims and Cheng-Zhong Xu and
Song Fu",
title = "Distributed Shared Arrays: An Integration of Message
Passing and Multithreading on {SMP} Clusters",
volume = "31",
number = "2",
pages = "161--184",
month = feb,
year = "2005",
DOI = "https://doi.org/10.1007/s11227-005-0041-5",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Wed Jul 6 10:36:19 MDT 2005",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=31&issue=2;
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=31&issue=2&spage=161",
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Hans-J. Boehm",
title = "Threads cannot be implemented as a library",
journal = j-SIGPLAN,
volume = "40",
number = "6",
pages = "261--268",
month = jun,
year = "2005",
DOI = "https://doi.org/10.1145/1065010.1065042",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Jun 21 17:04:05 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In many environments, multi-threaded code is written
in a language that was originally designed without
thread support (e.g. C), to which a library of
threading primitives was subsequently added. There
appears to be a general understanding that this is not
the right approach. We provide specific arguments that
a pure library approach, in which the compiler is
designed independently of threading issues, cannot
guarantee correctness of the resulting code. We first
review why the approach almost works, and then examine
some of the surprising behavior it may entail. We
further illustrate that there are very simple cases in
which a pure library-based approach seems incapable of
expressing an efficient parallel algorithm. Our
discussion takes place in the context of C with
Pthreads, since it is commonly used, reasonably well
specified, and does not attempt to ensure type-safety,
which would entail even stronger constraints. The
issues we raise are not specific to that context.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "This is an important paper: it shows that current
languages cannot be reliable for threaded programming
without language changes that prevent compiler
optimizations from foiling synchronization methods and
memory barriers. The article's author and others are
collaborating on a proposal for changes to the C++
language to remedy this, but that still leaves threads
unreliable in C code, even with POSIX threads.",
author = "S. Boroday and A. Petrenko and J. Singh and H.
title = "Dynamic analysis of {Java} applications for
multithreaded antipatterns",
journal = j-SIGSOFT,
volume = "30",
number = "4",
pages = "1--7",
month = jul,
year = "2005",
DOI = "https://doi.org/10.1145/1082983.1083247",
ISSN = "0163-5948 (print), 1943-5843 (electronic)",
ISSN-L = "0163-5948",
bibdate = "Wed Aug 1 17:14:51 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
abstract = "Formal verification is not always applicable to large
industrial software systems due to scalability issues
and difficulties in formal model and requirements
specification. The scalability and model derivation
problems could be alleviated by runtime trace analysis,
which combines both testing and formal verification. We
implement and compare an ad-hoc custom approach and a
formal approach to detect common bug patterns in
multithreaded Java software. We use the tracing
platform of the Eclipse IDE and state-of-the-art model
checker Spin.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGSOFT Software Engineering Notes",
journal-URL = "https://dl.acm.org/citation.cfm?id=J728",
author = "U. Brinkschulte and M. Pacher",
title = "Implementing Control Algorithms Within a Multithreaded
{Java} Microcontroller",
journal = j-LECT-NOTES-COMP-SCI,
volume = "3432",
pages = "33--49",
year = "2005",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
ISSN-L = "0302-9743",
bibdate = "Tue Apr 26 10:50:23 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
Ingenta database",
acknowledgement = ack-nhfb,
fjournal = "Lecture Notes in Computer Science",
author = "Theofanis Constantinou and Yiannakis Sazeides and
Pierre Michaud and Damien Fetis and Andre Seznec",
title = "Performance implications of single thread migration on
a chip multi-core",
journal = j-COMP-ARCH-NEWS,
volume = "33",
number = "4",
pages = "80--91",
month = nov,
year = "2005",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:08 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Cormac Flanagan and Stephen N. Freund and Shaz Qadeer
and Sanjit A. Seshia",
title = "Modular verification of multithreaded programs",
journal = j-THEOR-COMP-SCI,
volume = "338",
number = "1--3",
pages = "153--183",
day = "10",
month = jun,
year = "2005",
ISSN = "0304-3975 (print), 1879-2294 (electronic)",
ISSN-L = "0304-3975",
bibdate = "Fri Jul 8 14:05:16 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Theoretical Computer Science",
journal-URL = "http://www.sciencedirect.com/science/journal/03043975",
author = "P. Garcia and H. F. Korth",
title = "Hash-join algorithms on modern multithreaded computer
type = "Report",
number = "LUCSE-05-001",
institution = "Lehigh University",
address = "Bethlehem, PA, USA",
month = "????",
year = "2005",
bibdate = "Mon Dec 10 07:05:38 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
acknowledgement = ack-nhfb,
author = "M. E. Giampapa and R. Bellofatto and M. A. Blumrich
and D. Chen and M. B. Dombrowa and A. Gara and R. A.
Haring and P. Heidelberger and D. Hoenicke and G. V.
Kopcsay and B. J. Nathanson and B. D. Steinmacher-Burow
and M. Ohmacht and V. Salapura and P. Vranas",
title = "{Blue Gene/L} advanced diagnostics environment",
journal = j-IBM-JRD,
volume = "49",
number = "2/",
pages = "319--331",
month = "????",
year = "2005",
ISSN = "0018-8646 (print), 2151-8556 (electronic)",
ISSN-L = "0018-8646",
bibdate = "Wed Jun 1 08:14:41 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.research.ibm.com/journal/rd/492/giampapa.pdf",
abstract = "This paper describes the Blue Gene/L advanced
diagnostics environment (ADE) used throughout all
aspects of the Blue Gene/L project, including design,
logic verification, bringup, diagnostics, and
manufacturing test. The Blue Gene/L ADE consists of a
lightweight multithreaded coherence-managed kernel,
runtime libraries, device drivers, system programming
interfaces, compilers, and host-based development
tools. It provides complete and flexible access to all
features of the Blue Gene/L hardware. Prior to the
existence of hardware, ADE was used on Very high-speed
integrated circuit Hardware Description Language (VHDL)
models, not only for logic verification, but also for
performance measurements, code-path analysis, and
evaluation of architectural tradeoffs. During early
hardware bring-up, the ability to run in a
cycle-reproducible manner on both hardware and VHDL
proved invaluable in fault isolation and analysis.
However, ADE is also capable of supporting
high-performance applications and parallel test cases,
thereby permitting us to stress the hardware to the
limits of its capabilities. This paper also provides
insights into system-level and device-level programming
of Blue Gene/L to assist developers of high-performance
applications to more fully exploit the performance of
the machine.",
acknowledgement = ack-nhfb,
fjournal = "IBM Journal of Research and Development",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
ordernumber = "G322-0240",
author = "Marisa Gil and Ruben Pinilla",
title = "Thread coloring: a scheduler proposal from user to
hardware threads",
journal = j-OPER-SYS-REV,
volume = "39",
number = "2",
pages = "54--70",
month = apr,
year = "2005",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:43 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "Andreas Gustafsson",
title = "Threads without the pain",
journal = j-QUEUE,
volume = "3",
number = "9",
pages = "42--47",
month = nov,
year = "2005",
ISSN = "1542-7730 (print), 1542-7749 (electronic)",
ISSN-L = "1542-7730",
bibdate = "Sat Dec 17 07:37:28 MST 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Queue: Tomorrow's Computing Today",
author = "J{\"o}rg Keller and Andreas Gr{\"a}vinghoff",
title = "Thread-Based Virtual Duplex Systems in Embedded
journal = j-IEEE-MICRO,
volume = "25",
number = "2",
pages = "60--69",
month = mar # "\slash " # apr,
year = "2005",
DOI = "https://doi.org/10.1109/MM.2005.39",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Wed Apr 20 08:11:29 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://csdl.computer.org/comp/mags/mi/2005/02/m2060abs.htm;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Poonacha Kongetira and Kathirgamar Aingaran and Kunle
title = "{Niagara}: a 32-Way Multithreaded {Sparc} Processor",
journal = j-IEEE-MICRO,
volume = "25",
number = "2",
pages = "21--29",
month = mar # "\slash " # apr,
year = "2005",
DOI = "https://doi.org/10.1109/MM.2005.35",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Wed Apr 20 08:11:29 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://csdl.computer.org/comp/mags/mi/2005/02/m2021abs.htm;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Xiaoye S. Li",
title = "An overview of {SuperLU}: {Algorithms},
implementation, and user interface",
journal = j-TOMS,
volume = "31",
number = "3",
pages = "302--325",
month = sep,
year = "2005",
DOI = "https://doi.org/10.1145/1089014.1089017",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Wed Oct 5 07:43:35 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We give an overview of the algorithms, design
philosophy, and implementation techniques in the
software SuperLU, for solving sparse unsymmetric linear
systems. In particular, we highlight the differences
between the sequential SuperLU (including its
multithreaded extension) and parallel SuperLU_DIST.
These include the numerical pivoting strategy, the
ordering strategy for preserving sparsity, the ordering
in which the updating tasks are performed, the
numerical kernel, and the parallelization strategy.
Because of the scalability concern, the parallel code
is drastically different from the sequential one. We
describe the user interfaces of the libraries, and
illustrate how to use the libraries most efficiently
depending on some matrix characteristics. Finally, we
give some examples of how the solver has been used in
large-scale scientific applications, and the
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
author = "Keith Loepere",
title = "Stackable thread mechanisms",
journal = j-OPER-SYS-REV,
volume = "39",
number = "4",
pages = "4--17",
month = oct,
year = "2005",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:53 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "H. M. Mathis and A. E. Mericas and J. D. McCalpin and
R. J. Eickemeyer and S. R. Kunkel",
title = "Characterization of simultaneous multithreading
({SMT}) efficiency in {POWER5}",
journal = j-IBM-JRD,
volume = "49",
number = "4/5",
pages = "555--564",
month = "????",
year = "2005",
ISSN = "0018-8646 (print), 2151-8556 (electronic)",
ISSN-L = "0018-8646",
bibdate = "Wed Oct 5 07:12:31 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.research.ibm.com/journal/rd/494/mathis.html",
acknowledgement = ack-nhfb,
fjournal = "IBM Journal of Research and Development",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
author = "Cameron McNairy and Rohit Bhatia",
title = "{Montecito}: a Dual-Core, Dual-Thread {Itanium}
journal = j-IEEE-MICRO,
volume = "25",
number = "2",
pages = "10--20",
month = mar # "\slash " # apr,
year = "2005",
DOI = "https://doi.org/10.1109/MM.2005.34",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Wed Apr 20 08:11:29 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://csdl.computer.org/comp/mags/mi/2005/02/m2010abs.htm;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Jayaram Mudigonda and Harrick M. Vin and Raj
title = "Managing memory access latency in packet processing",
journal = j-SIGMETRICS,
volume = "33",
number = "1",
pages = "396--397",
month = jun,
year = "2005",
CODEN = "????",
DOI = "https://doi.org/10.1145/1064212.1064272",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Fri Jun 27 09:21:27 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In this study, we refute the popular belief [1,2] that
packet processing does not benefit from data-caching.
We show that a small data-cache of 8KB can bring down
the packet processing time by much as 50-90\%, while
reducing the off-chip memory bandwidth usage by about
60-95\%. We also show that, unlike general-purpose
computing, packet processing, due to its
memory-intensive nature, cannot rely exclusively on
data-caching to eliminate the memory bottleneck
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
keywords = "data-caches; multithreading; network processors",
author = "Vlad Petric and Amir Roth",
title = "Energy-Effectiveness of Pre-Execution and Energy-Aware
{P}-Thread Selection",
journal = j-COMP-ARCH-NEWS,
volume = "33",
number = "2",
pages = "322--333",
month = may,
year = "2005",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:40:51 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Yaoping Ruan and Vivek S. Pai and Erich Nahum and John
M. Tracey",
title = "Evaluating the impact of simultaneous multithreading
on network servers using real hardware",
journal = j-SIGMETRICS,
volume = "33",
number = "1",
pages = "315--326",
month = jun,
year = "2005",
CODEN = "????",
DOI = "https://doi.org/10.1145/1071690.1064254",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Fri Jun 27 09:21:27 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper examines the performance of simultaneous
multithreading (SMT) for network servers using actual
hardware, multiple network server applications, and
several workloads. Using three versions of the Intel
Xeon processor with Hyper-Threading, we perform
macroscopic analysis as well as microarchitectural
measurements to understand the origins of the
performance bottlenecks for SMT processors in these
environments. The results of our evaluation suggest
that the current SMT support in the Xeon is application
and workload sensitive, and may not yield significant
benefits for network servers. In general, we find that
enabling SMT on real hardware usually produces only
slight performance gains, and can sometimes lead to
performance loss. In the uniprocessor case, previous
studies appear to have neglected the OS overhead in
switching from a uniprocessor kernel to an SMT-enabled
kernel. The performance loss associated with such
support is comparable to the gains provided by SMT. In
the 2-way multiprocessor case, the higher number of
memory references from SMT often causes the memory
system to become the bottleneck, offsetting any
processor utilization gains. This effect is compounded
by the growing gap between processor speeds and memory
latency. In trying to understand the large gains shown
by simulation studies, we find that while the general
trends for microarchitectural behavior agree with real
hardware, differences in sizing assumptions and
performance models yield much more optimistic benefits
for SMT than we observe.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
keywords = "network server; simultaneous multithreading (SMT)",
author = "Raimi Rufai and Muslim Bozyigit and Jaralla Alghamdi
and Moataz Ahmed",
title = "Multithreaded Parallelism with {OpenMP}",
volume = "15",
number = "4",
pages = "367--378",
month = dec,
year = "2005",
DOI = "https://doi.org/10.1142/S0129626405002283",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
bibdate = "Thu Sep 2 09:08:11 MDT 2010",
bibsource = "http://ejournals.wspc.com.sg/ppl/;
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "Resit Sendag and Ying Chen and David J. Lilja",
title = "The Impact of Incorrectly Speculated Memory Operations
in a Multithreaded Architecture",
volume = "16",
number = "3",
pages = "271--285",
month = mar,
year = "2005",
DOI = "https://doi.org/10.1109/TPDS.2005.36",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu Nov 10 08:30:29 MST 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Y. Shinjo and C. Pu",
title = "Achieving efficiency and portability in systems
software: a case study on {POSIX}-compliant
multithreaded programs",
volume = "31",
number = "9",
pages = "785--800",
month = sep,
year = "2005",
DOI = "https://doi.org/10.1109/TSE.2005.98",
ISSN = "0098-5589 (print), 1939-3520 (electronic)",
ISSN-L = "0098-5589",
bibdate = "Thu Feb 1 11:00:42 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2000.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1514446",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Software Engineering",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
author = "Robert F. St{\"a}rk",
title = "Formal specification and verification of the {C\#}
thread model",
journal = j-THEOR-COMP-SCI,
volume = "343",
number = "3",
pages = "482--508",
day = "17",
month = oct,
year = "2005",
ISSN = "0304-3975 (print), 1879-2294 (electronic)",
ISSN-L = "0304-3975",
bibdate = "Tue Mar 29 06:48:50 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "We present a high-level Abstract State Machine (ASM)
model of C\# threads and the .NET memory model. We
focus on purely managed, fully portable threading
features of C\#. The sequential model interleaves the
computation steps of the currently running threads and
is suitable for uniprocessors. The parallel model
addresses problems of true concurrency on
multi-processor systems. The models provide a sound
basis for the development of multi-threaded
applications in C\#. The thread and memory models
complete the abstract operational semantics of C\# in
[B{\"o}rger et al. Theoret. Comput. Sci., to appear].
The main invariants of the thread model concerning
locks, monitors and mutual exclusion are formally
verified in the AsmTP system, an interactive proof
assistant based on ASM logic.",
acknowledgement = ack-nhfb,
fjournal = "Theoretical Computer Science",
journal-URL = "http://www.sciencedirect.com/science/journal/03043975",
author = "Robert Steinke and Micah Clark and Elihu McMahon",
title = "A new pattern for flexible worker threads with
in-place consumption message queues",
journal = j-OPER-SYS-REV,
volume = "39",
number = "2",
pages = "71--73",
month = apr,
year = "2005",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Sat Aug 26 08:55:43 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
author = "H{\aa}kan Sundell and Philippas Tsigas",
title = "Fast and lock-free concurrent priority queues for
multi-thread systems",
journal = j-J-PAR-DIST-COMP,
volume = "65",
number = "5",
pages = "609--627",
month = may,
year = "2005",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Fri Jul 11 20:32:33 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Xinmin Tian and Milind Girkar and Aart Bik and Hideki
title = "Practical Compiler Techniques on Efficient
Multithreaded Code Generation for {OpenMP} Programs",
journal = j-COMP-J,
volume = "48",
number = "5",
pages = "588--601",
month = sep,
year = "2005",
DOI = "https://doi.org/10.1093/comjnl/bxh109",
ISSN = "0010-4620 (print), 1460-2067 (electronic)",
ISSN-L = "0010-4620",
bibdate = "Tue Nov 8 05:58:50 MST 2005",
bibsource = "http://comjnl.oxfordjournals.org/content/vol48/issue5/index.dtl;
URL = "http://comjnl.oxfordjournals.org/cgi/content/abstract/48/5/588;
acknowledgement = ack-nhfb,
fjournal = "The Computer Journal",
journal-URL = "http://comjnl.oxfordjournals.org/",
author = "Neil Vachharajani and Matthew Iyer and Chinmay Ashok
and Manish Vachharajani and David I. August and Daniel
title = "Chip multi-processor scalability for single-threaded
journal = j-COMP-ARCH-NEWS,
volume = "33",
number = "4",
pages = "44--53",
month = nov,
year = "2005",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri May 12 09:41:08 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Martin Abadi and Cormac Flanagan and Stephen N.
title = "Types for safe locking: {Static} race detection for
journal = j-TOPLAS,
volume = "28",
number = "2",
pages = "207--255",
month = mar,
year = "2006",
DOI = "https://doi.org/10.1145/1119479.1119480",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Fri Mar 10 18:46:58 MST 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
abstract = "This article presents a static race-detection analysis
for multithreaded shared-memory programs, focusing on
the Java programming language. The analysis is based on
a type system that captures many common synchronization
patterns. It supports classes with internal
synchronization, classes that require client-side
synchronization, and thread-local classes. In order to
demonstrate the effectiveness of the type system, we
have implemented it in a checker and applied it to over
40,000 lines of hand-annotated Java code. We found a
number of race conditions in the standard Java
libraries and other test programs. The checker required
fewer than 20 additional type annotations per 1,000
lines of code. This article also describes two
improvements that facilitate checking much larger
programs: an algorithm for annotation inference and a
user interface that clarifies warnings generated by the
checker. These extensions have enabled us to use the
checker for identifying race conditions in large-scale
software systems with up to 500,000 lines of code.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Programming Languages and
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Onur Acii{\c{c}}mez and {\c{C}}etin Kaya Ko{\c{c}} and
Jean-Pierre Seifert",
title = "On the Power of Simple Branch Prediction Analysis",
type = "Technical report",
institution = "School of EECS, Oregon State University",
address = "Corvallis, OR 97331, USA",
month = oct,
year = "2006",
bibdate = "Mon Nov 20 14:57:23 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2000.bib;
URL = "http://eprint.iacr.org/2006/351;
abstract = "Very recently, a new software side-channel attack,
called Branch Prediction Analysis (BPA) attack, has
been discovered and also demonstrated to be practically
feasible on popular commodity PC platforms. While the
above recent attack still had the flavor of a classical
timing attack against RSA, where one uses many
execution-time measurements under the same key in order
to statistically amplify some small but key-dependent
timing differences, we dramatically improve upon the
former result. We prove that a carefully written
spy-process running simultaneously with an RSA-process,
is able to collect during one \emph{single} RSA signing
execution almost all of the secret key bits. We call
such an attack, analyzing the CPU's Branch Predictor
states through spying on a single quasi-parallel
computation process, a \emph{Simple Branch Prediction
Analysis (SBPA)} attack --- sharply differentiating it
from those relying on statistical methods and
requiring many computation measurements under the same
key. The successful extraction of almost all secret key
bits by our SBPA attack against an openSSL RSA
implementation proves that the often recommended
blinding or so called randomization techniques to
protect RSA against side-channel attacks are, in the
context of SBPA attacks, totally useless. Additional to
that very crucial security implication, targeted at
such implementations which are assumed to be at least
statistically secure, our successful SBPA attack also
bears another equally critical security implication.
Namely, in the context of simple side-channel attacks,
it is widely believed that equally balancing the
operations after branches is a secure countermeasure
against such simple attacks. Unfortunately, this is not
true, as even such ``balanced branch'' implementations
can be completely broken by our SBPA attacks. Moreover,
despite sophisticated hardware-assisted partitioning
methods such as memory protection, sandboxing or even
virtualization, SBPA attacks empower an unprivileged
process to successfully attack other processes running
in parallel on the same processor. Thus, we conclude
that SBPA attacks are much more dangerous than
previously anticipated, as they obviously do not belong
to the same category as pure timing attacks.",
acknowledgement = ack-nhfb,
keywords = "implementation / Branch Prediction; Modular
Exponentiation; RSA; Side Channel Analysis;
Simultaneous Multithreading; Trusted Computing",
author = "Ali-Reza Adl-Tabatabai and Brian T. Lewis and Vijay
Menon and Brian R. Murphy and Bratin Saha and Tatiana
title = "Compiler and runtime support for efficient software
transactional memory",
journal = j-SIGPLAN,
volume = "41",
number = "6",
pages = "26--37",
month = jun,
year = "2006",
DOI = "https://doi.org/10.1145/1133981.1133985",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:42:48 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Programmers have traditionally used locks to
synchronize concurrent access to shared data.
Lock-based synchronization, however, has well-known
pitfalls: using locks for fine-grain synchronization
and composing code that already uses locks are both
difficult and prone to deadlock. Transactional memory
provides an alternate concurrency control mechanism
that avoids these pitfalls and significantly eases
concurrent programming. Transactional memory language
constructs have recently been proposed as extensions to
existing languages or included in new concurrent
language specifications, opening the door for new
compiler optimizations that target the overheads of
transactional memory. This paper presents compiler and
runtime optimizations for transactional memory language
constructs. We present a high-performance software
transactional memory system (STM) integrated into a
managed runtime environment. Our system efficiently
implements nested transactions that support both
composition of transactions and partial roll back. Our
JIT compiler is the first to optimize the overheads of
STM, and we show novel techniques for enabling JIT
optimizations on STM operations. We measure the
performance of our optimizations on a 16-way SMP
running multi-threaded transactional workloads. Our
results show that these techniques enable transactional
memory's performance to compete with that of well-tuned
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "code generation; compiler optimizations; locking;
synchronization; transactional memory; virtual
author = "T. Agerwala and M. Gupta",
title = "Systems research challenges: a scale-out perspective",
journal = j-IBM-JRD,
volume = "50",
number = "2/3",
pages = "173--??",
month = mar # " \slash " # may,
year = "2006",
ISSN = "0018-8646 (print), 2151-8556 (electronic)",
ISSN-L = "0018-8646",
bibdate = "Fri Feb 9 20:16:31 MST 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.research.ibm.com/journal/rd/502/agerwala.html",
abstract = "A scale-out system is a collection of interconnected,
modular, low-cost computers that work as a single
entity to cooperatively provide applications, systems
resources, and data to users. The dominant programming
model for such systems consists of message passing at
the systems level and multithreading at the element
level. Scale-out computers have traditionally been
developed and deployed to provide levels of performance
(throughput and parallel processing) beyond what was
achievable by large shared-memory computers that
utilized the fastest processors and the most expensive
memory systems. Today, exploiting scale-out at all
levels in systems is becoming imperative in order to
overcome a fundamental discontinuity in the development
of microprocessor technology caused by power
dissipation. The pervasive use of greater levels of
scale-out, on the other hand, creates its own
challenges in architecture, programming, systems
management, and reliability. This position paper
identifies some of the important research problems that
must be addressed in order to deal with the technology
disruption and fully realize the opportunity offered by
scale-out. Our examples are based on parallelism, but
the challenges we identify apply to scale-out more
acknowledgement = ack-nhfb,
fjournal = "IBM Journal of Research and Development",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
ordernumber = "G322-0247-00",
author = "D. F. Bacon and X. Shen",
title = "Braids and fibers: Language constructs with
architectural support for adaptive responses to memory
journal = j-IBM-JRD,
volume = "50",
number = "2/3",
pages = "209--??",
month = mar # " \slash " # may,
year = "2006",
ISSN = "0018-8646 (print), 2151-8556 (electronic)",
ISSN-L = "0018-8646",
bibdate = "Fri Feb 9 20:16:31 MST 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.research.ibm.com/journal/rd/502/bacon.html",
abstract = "As processor speeds continue to increase at a much
higher rate than memory speeds, memory latencies may
soon approach a thousand processor cycles. As a result,
the flat memory model that was made practical by deeply
pipelined superscalar processors with multilevel caches
will no longer be tenable. The most common approach to
this problem is multithreading; however, multithreading
requires either abundant independent applications or
well-parallelized monolithic applications, and neither
is easy to come by. We present high-level programming
constructs called braids and fibers. The programming
constructs facilitate the creation of programs that are
partially ordered, in which the partial orders can be
used to support adaptive responses to memory access
latencies. Braiding is simpler than parallelizing,
while yielding many of the same benefits. We show how
the programming constructs can be effectively supported
with simple instruction set architecture extensions and
microarchitectural enhancements. We have developed
braided versions of a number of important algorithms.
The braided code is easy to understand at the source
level and can be translated into highly efficient
instructions using our architecture extensions.",
acknowledgement = ack-nhfb,
fjournal = "IBM Journal of Research and Development",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
ordernumber = "G322-0247-00",
author = "Claudio Basile and Zbigniew Kalbarczyk and Ravishankar
K. Iyer",
title = "Active Replication of Multithreaded Applications",
volume = "17",
number = "5",
pages = "448--465",
month = may,
year = "2006",
DOI = "https://doi.org/10.1109/TPDS.2006.56",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu Jul 3 14:26:49 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
URL = "http://csdl.computer.org/comp/trans/td/2006/05/l0448s.pdf",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Colin Blundell and Dimitra Giannakopoulou and Corina
S. P{\u{a}}s{\u{a}}reanu",
title = "Assume-guarantee testing",
journal = j-SIGSOFT,
volume = "31",
number = "2",
pages = "1:1--1:??",
month = mar,
year = "2006",
DOI = "https://doi.org/10.1145/1108768.1123060",
ISSN = "0163-5948 (print), 1943-5843 (electronic)",
ISSN-L = "0163-5948",
bibdate = "Wed Aug 1 17:15:15 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Verification techniques for component-based systems
should ideally be able to predict properties of the
assembled system through analysis of individual
components before assembly. This work introduces such a
modular technique in the context of testing.
Assume-guarantee testing relies on the (automated)
decomposition of key system-level requirements into
local component requirements at design time. Developers
can verify the local requirements by checking
components in isolation; failed checks may indicate
violations of system requirements, while valid traces
from different components compose via the
assume-guarantee proof rule to potentially provide
system coverage. These local requirements also form the
foundation of a technique for efficient predictive
testing of assembled systems: given a correct system
run, this technique can predict violations by
alternative system runs without constructing those
runs. We discuss the application of our approach to
testing a multi-threaded NASA application, where we
treat threads as components.",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM SIGSOFT Software Engineering Notes",
journal-URL = "https://dl.acm.org/citation.cfm?id=J728",
author = "C. Blundell and E. C. Lewis and M. M. K. Martin",
title = "Subtleties of transactional memory atomicity
volume = "5",
number = "2",
pages = "17--17",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.18",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Transactional memory has great potential for
simplifying multithreaded programming by allowing
programmers to specify regions of the program that must
appear to execute atomically. Transactional memory
implementations then optimistically execute these
transactions concurrently to obtain high performance.
This work shows that the same atomic guarantees that
give transactions their power also have unexpected and
potentially serious negative effects on programs that
were written assuming narrower scopes of atomicity. We
make four contributions: (1) we show that a direct
translation of lock-based critical sections into
transactions can introduce deadlock into otherwise
correct programs, (2) we introduce the terms strong
atomicity and weak atomicity to describe the
interaction of transactional and non-transactional
code, (3) we show that code that is correct under weak
atomicity can deadlock under strong atomicity, and (4)
we demonstrate that sequentially composing
transactional code can also introduce deadlocks. These
observations invalidate the intuition that transactions
are strictly safer than lock-based critical sections,
that strong atomicity is strictly safer than weak
atomicity, and that transactions are always
acknowledgement = ack-nhfb,
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer languages; Computer Systems Organization;
Concurrent distributed and parallel languages;
deadlock; direct translation; Hardware; Information
science; Interference; Interleaved codes; Language
Classifications; Law; lock-based critical sections;
Multi-core/single-chip multiprocessors;
multi-threading; Multiple Data Stream Architectures
(Multiprocessors); multithreaded programming;
nontransactional code; operating systems (computers);
Parallel Architectures; Processor Architectures;
program verification; Programming Languages;
Programming profession; sequentially composing
transactional code; Software performance;
Software/Software Engineering; strong atomicity; System
recovery; Transaction databases; transaction
processing; transactional memory atomicity semantics;
weak atomicity",
author = "A. Bracy and K. Doshi and Q. Jacobson",
title = "Disintermediated Active Communication",
volume = "5",
number = "2",
pages = "15--15",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Disintermediated active communication (DAC) is a new
paradigm of communication in which a sending thread
actively engages a receiving thread when sending it a
message via shared memory. DAC is different than
existing approaches that use passive communication
through shared-memory --- based on intermittently
checking for messages --- or that use preemptive
communication but must rely on intermediaries such as
the operating system or dedicated interrupt channels.
An implementation of DAC builds on existing cache
coherency support and exploits light-weight user-level
interrupts. Inter-thread communication occurs via
monitored memory locations where the receiver thread
responds to invalidations of monitored addresses with a
light-weight user-level software-defined handler.
Address monitoring is supported by cache line
user-bits, or CLUbits. CLUbits reside in the cache next
to the coherence state, are private per thread, and
maintain user-defined per-cache-line state. A
light-weight software library can demultiplex asynchronous
notifications and handle exceptional cases. In
DAC-based programs threads coordinate with one another
by explicit signaling and implicit resource monitoring.
With the simple and direct communication primitives of
DAC, multi-threaded workloads synchronize at a finer
granularity and more efficiently utilize the hardware
of upcoming multi-core designs. This paper introduces
DAC, presents several signaling models for DAC-based
programs, and describes a simple memory-based framework
that supports DAC by leveraging existing
cache-coherency models. Our framework is general enough
to support uses beyond DAC.",
acknowledgement = ack-nhfb,
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "address monitoring; cache coherency; cache line
user-bits; cache storage; CLUbits; Computer aided
instruction; Concurrent computing; disintermediated
active communication; Hardware; High performance
computing; interrupts; interthread communication;
memory locations; Monitoring; multi-threading;
multicore designs; Operating systems; Processor
scheduling; Programming profession; resource
monitoring; shared memory; shared memory systems;
signaling models; software libraries; Software
libraries; software library; storage allocation;
user-level interrupts",
author = "Marcin Brzuszek and Andrzej Daniluk",
title = "Multithreaded transactions in scientific computing:
New versions of a computer program for kinematical
calculations of {RHEED} intensity oscillations",
journal = j-COMP-PHYS-COMM,
volume = "175",
number = "10",
pages = "678--681",
day = "15",
month = nov,
year = "2006",
DOI = "https://doi.org/10.1016/j.cpc.2006.06.013",
ISSN = "0010-4655 (print), 1879-2944 (electronic)",
ISSN-L = "0010-4655",
bibdate = "Mon Feb 13 23:42:10 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0010465506002979",
acknowledgement = ack-nhfb,
fjournal = "Computer Physics Communications",
journal-URL = "http://www.sciencedirect.com/science/journal/00104655",
author = "Christophe C{\'e}rin and Jean-Luc Gaudiot and Michel
title = "A Multithreaded {SQL} Service",
volume = "16",
number = "2",
pages = "245--259",
month = jun,
year = "2006",
DOI = "https://doi.org/10.1142/S0129626406002605",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
bibdate = "Thu Sep 2 09:08:11 MDT 2010",
bibsource = "http://ejournals.wspc.com.sg/ppl/;
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "Koushik Chakraborty and Philip M. Wells and Gurindar
S. Sohi",
title = "Computation spreading: employing hardware migration to
specialize {CMP} cores on-the-fly",
journal = j-SIGPLAN,
volume = "41",
number = "11",
pages = "283--292",
month = nov,
year = "2006",
DOI = "https://doi.org/10.1145/1168919.1168893",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:49:40 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In canonical parallel processing, the operating system
(OS) assigns a processing core to a single thread from
a multithreaded server application. Since different
threads from the same application often carry out
similar computation, albeit at different times, we
observe extensive code reuse among different
processors, causing redundancy (e.g., in our server
workloads, 45-65\% of all instruction blocks are
accessed by all processors). Moreover, largely
independent fragments of computation compete for the
same private resources causing destructive
interference. Together, this redundancy and
interference lead to poor utilization of private
microarchitecture resources such as caches and branch
predictors. We present Computation Spreading (CSP),
which employs hardware migration to distribute a
thread's dissimilar fragments of computation across the
multiple processing cores of a chip multiprocessor
(CMP), while grouping similar computation fragments
from different threads together. This paper focuses on
a specific example of CSP for OS intensive server
applications: separating application level (user)
computation from the OS calls it makes. When performing
CSP, each core becomes temporally specialized to
execute certain computation fragments, and the same
core is repeatedly used for such fragments. We examine
two specific thread assignment policies for CSP, and
show that these policies, across four server workloads,
are able to reduce instruction misses in private L2
caches by 27-58\%, private L2 load misses by 0-19\%,
and branch mispredictions by 9-25\%.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "cache locality; dynamic specialization",
author = "Weihaw Chuang and Satish Narayanasamy and Ganesh
Venkatesh and Jack Sampson and Michael {Van Biesbrouck}
and Gilles Pokam and Brad Calder and Osvaldo Colavin",
title = "Unbounded page-based transactional memory",
journal = j-SIGPLAN,
volume = "41",
number = "11",
pages = "347--358",
month = nov,
year = "2006",
DOI = "https://doi.org/10.1145/1168918.1168901",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:49:40 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Exploiting thread level parallelism is paramount in
the multicore era. Transactions enable programmers to
expose such parallelism by greatly simplifying the
multi-threaded programming model. Virtualized
transactions (unbounded in space and time) are
desirable, as they can increase the scope of
transactions' use, and thereby further simplify a
programmer's job. However, hardware support is
essential to support efficient execution of unbounded
transactions. In this paper, we introduce Page-based
Transactional Memory to support unbounded transactions.
We combine transaction bookkeeping with the virtual
memory system to support fast transaction conflict
detection, commit, abort, and to maintain transactions'
speculative data.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrency; parallel programming; transactional
memory; transactions; virtual memory",
author = "O. Ergin and O. Unsal and X. Vera and A. Gonzalez",
title = "Exploiting Narrow Values for Soft Error Tolerance",
volume = "5",
number = "2",
pages = "12--12",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Soft errors are an important challenge in contemporary
microprocessors. Particle hits on the components of a
processor are expected to create an increasing number
of transient errors with each new microprocessor
generation. In this paper we propose simple mechanisms
that effectively reduce the vulnerability to soft
errors in a processor. Our designs are generally
motivated by the fact that many of the produced and
consumed values in the processors are narrow and their
upper order bits are meaningless. Soft errors caused by
any particle strike to these higher order bits can be
avoided by simply identifying these narrow values.
Alternatively soft errors can be detected or corrected
on the narrow values by replicating the vulnerable
portion of the value inside the storage space provided
for the upper order bits of these operands. We offer a
variety of schemes that make use of narrow values and
analyze their efficiency in reducing soft error
vulnerability of level-1 data cache of the processor.",
acknowledgement = ack-nhfb,
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache storage; Cache storage; contemporary
microprocessors; data cache; Data Cache; Error
correction; error correction; Error Correction; error
correction; error detection; Hardware; Impurities;
Manufacturing; microprocessor chips; Microprocessors;
Multithreading; Narrow Values; narrow values; Neutrons;
particle strike; Process design; radiation effects;
Random access memory; soft error tolerance; Soft
Errors; system recovery; transient errors; transients",
author = "Michael Factor and Assaf Schuster and Konstantin
title = "A Platform-Independent Distributed Runtime for
Standard Multithreaded {Java}",
journal = j-INT-J-PARALLEL-PROG,
volume = "34",
number = "2",
pages = "113--142",
month = apr,
year = "2006",
DOI = "https://doi.org/10.1007/s10766-006-0007-0",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Wed Jul 9 16:05:55 MDT 2008",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=34&issue=2;
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=34&issue=2&spage=113",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
keywords = "bytecode instrumentation; distributed computing;
distributed shared memory; Java",
author = "Juan Carlos Gomez and Vernon Rego and V. S. Sunderam",
title = "Scheduling communication in multithreaded programs:
experimental results",
journal = j-CCPE,
volume = "18",
number = "1",
pages = "1--28",
month = jan,
year = "2006",
DOI = "https://doi.org/10.1002/cpe.904",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Mon Dec 5 10:08:00 MST 2011",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "13 Sep 2005",
author = "Juan Carlos Gomez and Jorge R. Ramos and Vernon Rego",
title = "Signals, timers, and continuations for multithreaded
user-level protocols",
journal = j-SPE,
volume = "36",
number = "5",
pages = "449--471",
day = "25",
month = apr,
year = "2006",
DOI = "https://doi.org/10.1002/spe.700",
ISSN = "0038-0644 (print), 1097-024X (electronic)",
ISSN-L = "0038-0644",
bibdate = "Wed Oct 17 18:33:12 MDT 2007",
bibsource = "http://www.interscience.wiley.com/jpages/0038-0644;
acknowledgement = ack-nhfb,
fjournal = "Software---Practice and Experience",
journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
onlinedate = "19 Jan 2006",
author = "Clemens Grelck and Sven-Bodo Scholz",
title = "{SAC} --- a Functional Array Language for Efficient
Multi-threaded Execution",
journal = j-INT-J-PARALLEL-PROG,
volume = "34",
number = "4",
pages = "383--427",
month = aug,
year = "2006",
DOI = "https://doi.org/10.1007/s10766-006-0018-x",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Wed Jul 9 16:06:07 MDT 2008",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=34&issue=4;
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=34&issue=4&spage=383",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
keywords = "Compiler optimisation; data parallel programming;
multi-threading; Single Assignment C",
author = "Claude Kaiser and Jean-Fran{\c{c}}ois Pradat-Peyre and
Sami {\'E}vangelista and Pierre Rousseau",
title = "Comparing {Java}, {C\#} and {Ada} monitors queuing
policies: a case study and its {Ada} refinement",
journal = j-SIGADA-LETTERS,
volume = "26",
number = "2",
pages = "23--37",
month = aug,
year = "2006",
DOI = "https://doi.org/10.1145/1165678.1165681",
ISSN = "1094-3641 (print), 1557-9476 (electronic)",
ISSN-L = "1094-3641",
bibdate = "Tue Jun 17 09:16:14 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Learning concurrency paradigms is necessary but it is
not sufficient since the choice of run-time semantics
may introduce subtle programming errors. It is the aim
of this paper to exemplify the importance of process
queuing and awaking policies resulting from possible
choices of the monitor concept implementation. The first
part of the paper compares the behaviour of concurrent
processes sharing a unique waiting queue for condition
synchronization when implemented in Java or in Ada. A
particular solution of the dining philosophers paradigm
will be used to show how the difference in the monitor
semantics may lead or not to deadlock. This comparison
provides insight for deriving a correct Java
implementation. The second part of the paper shows how
the implementation can be refined when using Ada entry
families and requeue with requeue once restriction. The
result is elegant, safe and fair, and deterministic.
This paper ends with quantitative comparisons of
concurrency complexity and of concurrency
effectiveness. We conclude that Java and C\#
multithreading need defensive concurrent programming
while Ada allows more latitude for developing correct
concurrent programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGAda Ada Letters",
author = "Seon Wook Kim and Chong-Liang Ooi and Rudolf Eigenmann
and Babak Falsafi and T. N. Vijaykumar",
title = "Exploiting reference idempotency to reduce speculative
storage overflow",
journal = j-TOPLAS,
volume = "28",
number = "5",
pages = "942--965",
month = sep,
year = "2006",
DOI = "https://doi.org/10.1145/1152649.1152653",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Wed Sep 6 07:13:55 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
abstract = "Recent proposals for multithreaded architectures
employ speculative execution to allow threads with
unknown dependences to execute speculatively in
parallel. The architectures use hardware speculative
storage to buffer speculative data, track data
dependences and correct incorrect executions through
roll-backs. Because all memory references access the
speculative storage, current proposals implement
speculative storage using small memory structures to
achieve fast access. The limited capacity of the
speculative storage causes considerable performance
loss due to speculative storage overflow whenever a
thread's speculative state exceeds the speculative
storage capacity. Larger threads exacerbate the
overflow problem but are preferable to smaller threads,
as larger threads uncover more parallelism. In this
article, we discover a new program property called
memory reference idempotency. Idempotent references are
guaranteed to be eventually corrected, though the
references may be temporarily incorrect in the process
of speculation. Therefore, idempotent references, even
from nonparallelizable program sections, need not be
tracked in the speculative storage, and instead can
directly access nonspeculative storage (i.e.,
conventional memory hierarchy). Thus, we reduce the
demand for speculative storage space in large threads.
We define a formal framework for reference idempotency
and present a novel compiler-assisted speculative
execution model. We prove the necessary and sufficient
conditions for reference idempotency using our model.
We present a compiler algorithm to label idempotent
memory references for the hardware. Experimental
results show that for our benchmarks, over 60\% of the
references in nonparallelizable program sections are
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Programming Languages and
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Costas Kyriacou and Paraskevas Evripidou and Pedro
title = "{CacheFlow}: Cache Optimizations for Data Driven
volume = "16",
number = "2",
pages = "229--244",
month = jun,
year = "2006",
DOI = "https://doi.org/10.1142/S0129626406002599",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
bibdate = "Thu Sep 2 09:08:11 MDT 2010",
bibsource = "http://ejournals.wspc.com.sg/ppl/;
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "Costas Kyriacou and Paraskevas Evripidou and Pedro
title = "Data-Driven Multithreading Using Conventional
volume = "17",
number = "10",
pages = "1176--1188",
month = oct,
year = "2006",
DOI = "https://doi.org/10.1109/TPDS.2006.136",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu Jul 3 14:26:50 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Edward A. Lee",
title = "The Problem with Threads",
type = "Technical Report",
number = "UCB/EECS-2006-1",
institution = "Electrical Engineering and Computer Sciences,
University of California at Berkeley",
address = "Berkeley, CA, USA",
day = "10",
month = jan,
year = "2006",
bibdate = "Thu Oct 23 15:07:59 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.eecs.berkeley.edu/Pubs/TechRpts/2006/EECS-2006-1.html",
abstract = "Threads are a seemingly straightforward adaptation of
the dominant sequential model of computation to
concurrent systems. Languages require little or no
syntactic changes to support threads, and operating
systems and architectures have evolved to efficiently
support them. Many technologists are pushing for
increased use of multithreading in software in order to
take advantage of the predicted increases in
parallelism in computer architectures. In this paper, I
argue that this is not a good idea. Although threads
seem to be a small step from sequential computation, in
fact, they represent a huge step. They discard the most
essential and appealing properties of sequential
computation: understandability, predictability, and
determinism. Threads, as a model of computation, are
wildly nondeterministic, and the job of the programmer
becomes one of pruning that nondeterminism. Although
many research techniques improve the model by offering
more effective pruning, I argue that this is
approaching the problem backwards. Rather than pruning
nondeterminism, we should build from essentially
deterministic, composable components. Nondeterminism
should be explicitly and judiciously introduced where
needed, rather than removed where not needed. The
consequences of this principle are profound. I argue
for the development of concurrent coordination
languages based on sound, composable formalisms. I
believe that such languages will yield much more
reliable, and more concurrent programs.",
acknowledgement = ack-nhfb,
author = "S.-W. Lee and J.-L. Gaudiot",
title = "Throttling-Based Resource Management in High
Performance Multithreaded Architectures",
journal = j-IEEE-TRANS-COMPUT,
volume = "55",
number = "9",
pages = "1142--1152",
month = sep,
year = "2006",
DOI = "https://doi.org/10.1109/TC.2006.154",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Mon Jul 4 15:35:56 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1668042",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Xin Li and Marian Boldt and Reinhard von Hanxleden",
title = "Mapping {Esterel} onto a multi-threaded embedded
journal = j-COMP-ARCH-NEWS,
volume = "34",
number = "5",
pages = "303--314",
month = dec,
year = "2006",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri Oct 27 06:18:30 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Xin Li and Marian Boldt and Reinhard von Hanxleden",
title = "Mapping {Esterel} onto a multi-threaded embedded
journal = j-OPER-SYS-REV,
volume = "40",
number = "5",
pages = "303--314",
month = dec,
year = "2006",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Fri Oct 27 06:18:30 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
author = "Xin Li and Marian Boldt and Reinhard von Hanxleden",
title = "Mapping {Esterel} onto a multi-threaded embedded
journal = j-SIGPLAN,
volume = "41",
number = "11",
pages = "303--314",
month = nov,
year = "2006",
DOI = "https://doi.org/10.1145/1168857.1168896",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:49:40 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The synchronous language Esterel is well-suited for
programming control-dominated reactive systems at the
system level. It provides non-traditional control
structures, in particular concurrency and various forms
of preemption, which allow to concisely express
reactive behavior. As these control structures cannot
be mapped easily onto traditional, sequential
processors, an alternative approach that has emerged
recently makes use of special-purpose reactive
processors. However, the designs proposed so far have
limitations regarding completeness of the language
support, and did not really take advantage of
compile-time knowledge to optimize resource usage. This
paper presents a reactive processor, the Kiel Esterel
Processor 3a (KEP3a), and its compiler. The KEP3a
improves on earlier designs in several areas; most
notable are the support for exception handling and the
provision of context-dependent preemption handling
instructions. The KEP3a compiler presented here is to
our knowledge the first for multi-threaded reactive
processors. The translation of Esterel's preemption
constructs onto KEP3a assembler is straightforward;
however, a challenge is the correct and efficient
representation of Esterel's concurrency. The compiler
generates code that respects data and control
dependencies using the KEP3a priority-based scheduling
mechanism. We present a priority assignment approach
that makes use of a novel concurrent control flow graph
and has a complexity that in practice tends to be
linear in the size of the program. Unlike earlier
Esterel compilation schemes, this approach avoids
unnecessary context switches by considering each
thread's actual execution state at run time.
Furthermore, it avoids code replication present in
other approaches.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrency; Esterel; low-power processing;
multi-threading; reactive systems",
author = "Tong Li and Alvin R. Lebeck and Daniel J. Sorin",
title = "Spin Detection Hardware for Improved Management of
Multithreaded Systems",
volume = "17",
number = "6",
pages = "508--521",
month = jun,
year = "2006",
DOI = "https://doi.org/10.1109/TPDS.2006.78",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu Jul 3 14:26:49 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Sewon Moon and Byeong-Mo Chang",
title = "A thread monitoring system for multithreaded {Java}
journal = j-SIGPLAN,
volume = "41",
number = "5",
pages = "21--29",
month = may,
year = "2006",
DOI = "https://doi.org/10.1145/1149982.1149985",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:42:34 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "To assist developing robust multithreaded software, we
develop a thread monitoring system for multithreaded
Java programs, which can trace or monitor running
threads and synchronization. We design a monitoring
system which has options to select interesting threads
and synchronized actions. Using this tool, programmers
can monitor only interesting threads and
synchronization in more details by selecting options,
and can detect a deadlock. It also provides profile
information after execution, which summarizes behavior
of running threads and synchronized actions during
execution. We implement the system based on code
inlining, and present some experimental results.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "Java; monitoring; synchronization; thread",
author = "T. Y. Morad and U. C. Weiser and A. Kolodny and M.
Valero and E. Ayguade",
title = "Performance, power efficiency and scalability of
asymmetric cluster chip multiprocessors",
volume = "5",
number = "1",
pages = "14--17",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "This paper evaluates asymmetric cluster chip
multiprocessor (ACCMP) architectures as a mechanism to
achieve the highest performance for a given power
budget. ACCMPs execute serial phases of multithreaded
programs on large high-performance cores whereas
parallel phases are executed on a mix of large and many
small simple cores. Theoretical analysis reveals a
performance upper bound for symmetric multiprocessors,
which is surpassed by asymmetric configurations at
certain power ranges. Our emulations show that
asymmetric multiprocessors can reduce power consumption
by more than two thirds with similar performance
compared to symmetric multiprocessors.",
acknowledgement = ack-nhfb,
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ACCMP; Application software; asymmetric cluster chip
multiprocessors; Chip Multiprocessors; Emulation;
Frequency; microprocessor chips; multi-threading;
multiprocessing systems; multithreaded program;
Optimized production technology; Parallel processing;
parallel processing; power consumption reduction; power
efficiency; Power Efficiency; Power system modeling;
Queueing analysis; Scalability; Upper bound; Voltage",
author = "Mayur Naik and Alex Aiken and John Whaley",
title = "Effective static race detection for {Java}",
journal = j-SIGPLAN,
volume = "41",
number = "6",
pages = "308--319",
month = jun,
year = "2006",
DOI = "https://doi.org/10.1145/1133255.1134018",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:42:48 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We present a novel technique for static race detection
in Java programs, comprised of a series of stages that
employ a combination of static analyses to successively
reduce the pairs of memory accesses potentially
involved in a race. We have implemented our technique
and applied it to a suite of multi-threaded Java
programs. Our experiments show that it is precise,
scalable, and useful, reporting tens to hundreds of
serious and previously unknown concurrency bugs in
large, widely-used programs with few false alarms.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrency; Java; multi-threading; static race
detection; synchronization",
author = "Mangala Gowri Nanda and S. Ramesh",
title = "Interprocedural slicing of multithreaded programs with
applications to {Java}",
journal = j-TOPLAS,
volume = "28",
number = "6",
pages = "1088--1144",
month = nov,
year = "2006",
DOI = "https://doi.org/10.1145/1186632.1186636",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Sat Apr 14 11:13:21 MDT 2007",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Programming Languages and
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Satish Narayanasamy and Cristiano Pereira and Brad
title = "Recording shared memory dependencies using strata",
journal = j-SIGPLAN,
volume = "41",
number = "11",
pages = "229--240",
month = nov,
year = "2006",
DOI = "https://doi.org/10.1145/1168857.1168886",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:49:40 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Significant time is spent by companies trying to
reproduce and fix bugs. BugNet and FDR are recent
architecture proposals that provide architecture
support for deterministic replay debugging. They focus
on continuously recording information about the
program's execution, which can be communicated back to
the developer. Using that information, the developer
can deterministically replay the program's execution to
reproduce and fix the bugs. In this paper, we propose
using Strata to efficiently capture the shared memory
dependencies. A stratum creates a time layer across all
the logs for the running threads, which separates all
the memory operations executed before and after the
stratum. A strata log allows us to determine all the
shared memory dependencies during replay and thereby
supports deterministic replay debugging for
multi-threaded programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "debugging; dependencies; logging; replay; shared
memory; strata",
author = "G. Ottoni and R. Rangan and A. Stoler and M. J.
Bridges and D. I. August",
title = "From sequential programs to concurrent threads",
volume = "5",
number = "1",
pages = "6--9",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Chip multiprocessors are of increasing importance due
to difficulties in achieving higher clock frequencies
in uniprocessors, but their success depends on finding
useful work for the processor cores. This paper
addresses this challenge by presenting a simple
compiler approach that extracts non-speculative
thread-level parallelism from sequential codes. We
present initial results from this technique targeting a
validated dual-core processor model, achieving speedups
ranging from 9-48\% with an average of 25\% for
important benchmark loops over their single-threaded
versions. We also identify important next steps found
during our pursuit of higher degrees of automatic
acknowledgement = ack-nhfb,
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "automatic threading; Bridges; Clocks; Computer
science; concurrency control; concurrent threads;
Frequency; Hardware; Microprocessors; multi-threading;
nonspeculative thread-level parallelism; Parallel
processing; Pipeline processing; program compiler;
program compilers; Program processors; sequential
author = "Angshuman Parashar and Anand Sivasubramaniam and
Sudhanva Gurumurthi",
title = "{SlicK}: slice-based locality exploitation for
efficient redundant multithreading",
journal = j-COMP-ARCH-NEWS,
volume = "34",
number = "5",
pages = "95--105",
month = dec,
year = "2006",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri Oct 27 06:18:30 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Angshuman Parashar and Anand Sivasubramaniam and
Sudhanva Gurumurthi",
title = "{SlicK}: slice-based locality exploitation for
efficient redundant multithreading",
journal = j-OPER-SYS-REV,
volume = "40",
number = "5",
pages = "95--105",
month = dec,
year = "2006",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Fri Oct 27 06:18:30 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
author = "Angshuman Parashar and Anand Sivasubramaniam and
Sudhanva Gurumurthi",
title = "{SlicK}: slice-based locality exploitation for
efficient redundant multithreading",
journal = j-SIGPLAN,
volume = "41",
number = "11",
pages = "95--105",
month = nov,
year = "2006",
DOI = "https://doi.org/10.1145/1168857.1168870",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:49:40 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Transient faults are expected to be a major design
consideration in future microprocessors. Recent
proposals for transient fault detection in processor
cores have revolved around the idea of redundant
threading, which involves redundant execution of a
program across multiple execution contexts. This paper
presents a new approach to redundant threading by
bringing together the concepts of slice-level execution
and value and control-flow locality into a novel
partial redundant threading mechanism called SlicK. The
purpose of redundant execution is to check the
integrity of the outputs propagating out of the core
(typically through stores). SlicK implements redundancy
at the granularity of backward-slices of these output
instructions and exploits value and control-flow
locality to avoid redundantly executing slices that
lead to predictable outputs, thereby avoiding redundant
execution of a significant fraction of instructions
while maintaining extremely low vulnerabilities for
critical processor structures. We propose the
microarchitecture of a backward-slice extractor called
SliceEM that is able to identify backward slices
without interrupting the instruction flow, and show how
this extractor and a set of predictors can be
integrated into a redundant threading mechanism to form
SlicK. Detailed simulations with SPEC CPU2000
benchmarks show that SlicK can provide around 10.2\%
performance improvement over a well known redundant
threading mechanism, buying back over 50\% of the loss
suffered due to redundant execution. SlicK can keep the
Architectural Vulnerability Factors of processor
structures to typically 0\%-2\%. More importantly,
SlicK's slice-based mechanisms provide future
opportunities for exploring interesting points in the
performance-reliability design space based on market
segment needs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "backward slice extraction; microarchitecture;
redundant threading; transient faults",
author = "Christopher J. F. Pickett and Clark Verbrugge",
title = "{SableSpMT}: a software framework for analysing
speculative multithreading in {Java}",
journal = j-SIGSOFT,
volume = "31",
number = "1",
pages = "59--66",
month = jan,
year = "2006",
DOI = "https://doi.org/10.1145/1108768.1108809",
ISSN = "0163-5948 (print), 1943-5843 (electronic)",
ISSN-L = "0163-5948",
bibdate = "Wed Aug 1 17:15:12 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
abstract = "Speculative multithreading (SpMT) is a promising
optimisation technique for achieving faster execution
of sequential programs on multiprocessor hardware.
Analysis of and data acquisition from such systems is
however difficult and complex, and is typically limited
to a specific hardware design and simulation
environment. We have implemented a flexible,
software-based speculative multithreading architecture
within the context of a full-featured Java virtual
machine. We consider the entire Java language and
provide a complete set of support features for
speculative execution, including return value
prediction. Using our system we are able to generate
extensive dynamic analysis information, analyse the
effects of runtime feedback, and determine the impact
of incorporating static, offline information. Our
approach allows for accurate analysis of Java SpMT on
existing, commodity multiprocessor hardware, and
provides a vehicle for further experimentation with
speculative approaches and optimisations.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGSOFT Software Engineering Notes",
journal-URL = "https://dl.acm.org/citation.cfm?id=J728",
author = "Polyvios Pratikakis and Jeffrey S. Foster and Michael
title = "{LOCKSMITH}: context-sensitive correlation analysis
for race detection",
journal = j-SIGPLAN,
volume = "41",
number = "6",
pages = "320--331",
month = jun,
year = "2006",
DOI = "https://doi.org/10.1145/1133255.1134019",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:42:48 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "One common technique for preventing data races in
multi-threaded programs is to ensure that all accesses
to shared locations are consistently protected by a
lock. We present a tool called LOCKSMITH for detecting
data races in C programs by looking for violations of
this pattern. We call the relationship between locks
and the locations they protect consistent correlation,
and the core of our technique is a novel
constraint-based analysis that infers consistent
correlation context-sensitively, using the results to
check that locations are properly guarded by locks. We
present the core of our algorithm for a simple formal
language $\lambda_{\triangleright}$ which we have proven sound, and
discuss how we scale it up to an algorithm that aims to
be sound for all of C. We develop several techniques to
improve the precision and performance of the analysis,
including a sharing analysis for inferring thread
locality; existential quantification for modeling locks
in data structures; and heuristics for modeling unsafe
features of C such as type casts. When applied to
several benchmarks, including multi-threaded servers
and Linux device drivers, LOCKSMITH found several races
while producing a modest number of false alarms.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "context-sensitivity; correlation; locksmith;
multi-threaded programming; race detection; type
author = "Vimal K. Reddy and Eric Rotenberg and Sailashri
title = "Understanding prediction-based partial redundant
threading for low-overhead, high-coverage fault
journal = j-SIGPLAN,
volume = "41",
number = "11",
pages = "83--94",
month = nov,
year = "2006",
DOI = "https://doi.org/10.1145/1168917.1168869",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:49:40 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Redundant threading architectures duplicate all
instructions to detect and possibly recover from
transient faults. Several lighter weight Partial
Redundant Threading (PRT) architectures have been
proposed recently. (i) Opportunistic Fault Tolerance
duplicates instructions only during periods of poor
single-thread performance. (ii) ReStore does not
explicitly duplicate instructions and instead exploits
mispredictions among highly confident branch
predictions as symptoms of faults. (iii) Slipstream
creates a reduced alternate thread by replacing many
instructions with highly confident predictions. We
explore PRT as a possible direction for achieving the
fault tolerance of full duplication with the
performance of single-thread execution. Opportunistic
and ReStore yield partial coverage since they are
restricted to using only partial duplication or only
confident predictions, respectively. Previous analysis
of Slipstream fault tolerance was cursory and concluded
that only duplicated instructions are covered. In this
paper, we attempt to better understand Slipstream's
fault tolerance, conjecturing that the mixture of
partial duplication and confident predictions actually
closely approximates the coverage of full duplication.
A thorough dissection of prediction scenarios confirms
that faults in nearly 100\% of instructions are
detectable. Fewer than 0.1\% of faulty instructions are
not detectable due to coincident faults and
mispredictions. Next we show that the current recovery
implementation fails to leverage excellent detection
capability, since recovery sometimes initiates
belatedly, after already retiring a detected faulty
instruction. We propose and evaluate a suite of simple
microarchitectural alterations to recovery and
checking. Using the best alterations, Slipstream can
recover from faults in 99\% of instructions, compared
to only 78\% of instructions without alterations. Both
results are much higher than predicted by past
research, which claims coverage for only duplicated
instructions, or 65\% of instructions. On an 8-issue
SMT processor, Slipstream performs within 1.3\% of
single-thread execution whereas full duplication slows
performance by 14\%. A key byproduct of this paper is a
novel analysis framework in which every dynamic
instruction is considered to be hypothetically faulty,
thus not requiring explicit fault injection. Fault
coverage is measured in terms of the fraction of
candidate faulty instructions that are directly or
indirectly detectable before.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "branch prediction; chip multiprocessor (CMP);
redundant multithreading; simultaneous multithreading
(SMT); slipstream processor; time redundancy; transient
faults; value prediction",
author = "Won W. Ro and Stephen P. Crago and Alvin M. Despain
and Jean-Luc Gaudiot",
title = "Design and evaluation of a hierarchical decoupled
volume = "38",
number = "3",
pages = "237--259",
month = dec,
year = "2006",
DOI = "https://doi.org/10.1007/s11227-006-8321-2",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Wed Jul 9 17:32:29 MDT 2008",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=38&issue=3;
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=38&issue=3&spage=237",
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
keywords = "Data prefetching; Decoupled architectures; Instruction
level parallelism; Memory latency hiding;
Multithreading; Parallel architecture; Speculative
author = "Kenneth Russell and David Detlefs",
title = "Eliminating synchronization-related atomic operations
with biased locking and bulk rebiasing",
journal = j-SIGPLAN,
volume = "41",
number = "10",
pages = "263--272",
month = oct,
year = "2006",
DOI = "https://doi.org/10.1145/1167515.1167496",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:47:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The Java{\TM} programming language contains built-in
synchronization primitives for use in constructing
multithreaded programs. Efficient implementation of
these synchronization primitives is necessary in order
to achieve high performance. Recent research [9, 12,
10, 3, 7] has focused on the run-time elimination of
the atomic operations required to implement object
monitor synchronization primitives. This paper
describes a novel technique called store-free biased
locking which eliminates all synchronization-related
atomic operations on uncontended object monitors. The
technique supports the bulk transfer of object
ownership from one thread to another, and the selective
disabling of the optimization where unprofitable, using
epoch-based bulk rebiasing and revocation. It has been
implemented in the production version of the Java
HotSpot{\TM} VM and has yielded significant performance
improvements on a range of benchmarks and applications.
The technique is applicable to any virtual
machine-based programming language implementation with
mostly block-structured locking primitives.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "atomic; bias; Java; lock; monitor; optimization;
rebias; reservation; revoke; synchronization",
author = "Koushik Sen and Grigore Rosu and Gul Agha",
title = "Online efficient predictive safety analysis of
multithreaded programs",
volume = "8",
number = "3",
pages = "248--260",
month = jun,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1007/s10009-005-0192-y",
ISSN = "1433-2779 (print), 1433-2787 (electronic)",
ISSN-L = "1433-2779",
bibdate = "Wed Jul 9 18:12:21 MDT 2008",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=1433-2779&volume=8&issue=3;
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=1433-2779&volume=8&issue=3&spage=248",
acknowledgement = ack-nhfb,
fjournal = "International Journal on Software Tools for Technology
Transfer: STTT",
keywords = "JMPaX; Multithreaded analysis; Predictive analysis;
Runtime monitoring; Vector clock",
author = "Chulho Shin and Seong-Won Lee and Jean-Luc Gaudiot",
title = "Adaptive dynamic thread scheduling for simultaneous
multithreaded architectures with a detector thread",
journal = j-J-PAR-DIST-COMP,
volume = "66",
number = "10",
pages = "1304--1321",
month = oct,
year = "2006",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Fri Jul 11 20:32:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Geoffrey Smith",
title = "Improved typings for probabilistic noninterference in
a multi-threaded language",
journal = j-J-COMP-SECUR,
volume = "14",
number = "6",
pages = "591--623",
month = "????",
year = "2006",
DOI = "https://doi.org/10.3233/JCS-2006-14605",
ISSN = "0926-227X (print), 1875-8924 (electronic)",
ISSN-L = "0926-227X",
bibdate = "Tue May 24 06:23:23 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jcompsecur.bib;
acknowledgement = ack-nhfb,
fjournal = "Journal of Computer Security",
journal-URL = "http://content.iospress.com/journals/journal-of-computer-security",
author = "Pedro Trancoso and Paraskevas Evripidou and Kyriakos
Stavrou and Costas Kyriacou",
title = "A Case for Chip Multiprocessors Based on the
Data-Driven Multithreading Model",
journal = j-INT-J-PARALLEL-PROG,
volume = "34",
number = "3",
pages = "213--235",
month = jun,
year = "2006",
DOI = "https://doi.org/10.1007/s10766-006-0016-z",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Wed Jul 9 16:05:59 MDT 2008",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=34&issue=3;
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=34&issue=3&spage=213",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
keywords = "Chip multiprocessor; data-driven execution;
multithreading; parallel processing",
author = "Vasco T. Vasconcelos and Simon J. Gay and Ant{\'o}nio
title = "Type checking a multithreaded functional language with
session types",
journal = j-THEOR-COMP-SCI,
volume = "368",
number = "1--2",
pages = "64--87",
day = "5",
month = dec,
year = "2006",
ISSN = "0304-3975 (print), 1879-2294 (electronic)",
ISSN-L = "0304-3975",
bibdate = "Tue Mar 29 08:55:29 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Theoretical Computer Science",
journal-URL = "http://www.sciencedirect.com/science/journal/03043975",
author = "L. Wang and S. D. Stoller",
title = "Runtime analysis of atomicity for multithreaded
volume = "32",
number = "2",
pages = "93--110",
month = feb,
year = "2006",
DOI = "https://doi.org/10.1109/TSE.2006.1599419",
ISSN = "0098-5589 (print), 1939-3520 (electronic)",
ISSN-L = "0098-5589",
bibdate = "Thu Feb 1 11:00:42 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2000.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1599419",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Software Engineering",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
author = "Min Xu and Mark D. Hill and Rastislav Bodik",
title = "A regulated transitive reduction {(RTR)} for longer
memory race recording",
journal = j-SIGPLAN,
volume = "41",
number = "11",
pages = "49--60",
month = nov,
year = "2006",
DOI = "https://doi.org/10.1145/1168919.1168865",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:49:40 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multithreaded deterministic replay has important
applications in cyclic debugging, fault tolerance and
intrusion analysis. Memory race recording is a key
technology for multithreaded deterministic replay. In
this paper, we considerably improve our previous
always-on Flight Data Recorder (FDR) in four ways:\par
\begin{itemize} \item Longer recording by reducing the
log size growth rate to approximately one byte per
thousand dynamic instructions. \item Lower hardware
cost by reducing the cost to 24 KB per processor core.
\item Simpler design by modifying only the cache
coherence protocol, but not the cache. \item Broader
applicability by supporting both Sequential Consistency
(SC) and Total Store Order (TSO) memory consistency
models (existing recorders support only SC).
\end{itemize} These improvements stem from several
ideas: (1) a Regulated Transitive Reduction (RTR)
recording algorithm that creates stricter and
vectorizable dependencies to reduce the log growth
rate; (2) a Set/LRU timestamp approximation method that
better approximates timestamps of uncached memory
locations to reduce the hardware cost; (3) an
order-value-hybrid recording method that explicitly
logs the value of potential SC-violating load
instructions to support multiprocessor systems with
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "determinism; multithreading; race recording",
author = "Lukasz Ziarek and Philip Schatz and Suresh
title = "Stabilizers: a modular checkpointing abstraction for
concurrent functional programs",
journal = j-SIGPLAN,
volume = "41",
number = "9",
pages = "136--147",
month = sep,
year = "2006",
DOI = "https://doi.org/10.1145/1160074.1159822",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:46:22 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Transient faults that arise in large-scale software
systems can often be repaired by re-executing the code
in which they occur. Ascribing a meaningful semantics
for safe re-execution in multi-threaded code is not
obvious, however. For a thread to correctly re-execute
a region of code, it must ensure that all other threads
which have witnessed its unwanted effects within that
region are also reverted to a meaningful earlier state.
If not done properly, data inconsistencies and other
undesirable behavior may result. However, automatically
determining what constitutes a consistent global
checkpoint is not straightforward since thread
interactions are a dynamic property of the program. In
this paper, we present a safe and efficient
checkpointing mechanism for Concurrent ML (CML) that
can be used to recover from transient faults. We
introduce a new linguistic abstraction called
stabilizers that permits the specification of
per-thread monitors and the restoration of globally
consistent checkpoints. Safe global states are computed
through lightweight monitoring of communication events
among threads (e.g. message-passing operations or
updates to shared variables). Our experimental results
on several realistic, multithreaded, server-style CML
applications, including a web server and a windowing
toolkit, show that the overheads to use stabilizers are
small, and lead us to conclude that they are a viable
mechanism for defining safe checkpoints in concurrent
functional programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "checkpointing; concurrent ML; concurrent programming;
error recovery; exception handling; transactions",
author = "Tamar Benaya and Ela Zur",
title = "Understanding threads in an advanced {Java} course",
journal = j-SIGCSE,
volume = "39",
number = "3",
pages = "323--323",
month = sep,
year = "2007",
DOI = "https://doi.org/10.1145/1269900.1268890",
ISSN = "0097-8418 (print), 2331-3927 (electronic)",
ISSN-L = "0097-8418",
bibdate = "Sat Nov 17 16:57:36 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "Proceedings of the 12th Annual SIGCSE Conference on
Innovation and Technology in Computer Science Education
abstract = "This poster describes difficulties in understanding
threads in an Advanced Java course given at the
Computer Science department of the Open University of
Israel (OUI). We present a typical question which
focuses on several aspects of multi-threaded
programming given in an exam. We discuss the students'
answers and point to typical misunderstandings of the
acknowledgement = ack-nhfb,
fjournal = "SIGCSE Bulletin (ACM Special Interest Group on
Computer Science Education)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688",
author = "Peter Benner and Maribel Castillo and Rafael Mayo and
Enrique S. Quintana-Ort{\'\i} and Gregorio
title = "Stabilizing large-scale generalized systems on
parallel computers using multithreading and
journal = j-CCPE,
volume = "19",
number = "4",
pages = "531--542",
day = "25",
month = mar,
year = "2007",
DOI = "https://doi.org/10.1002/cpe.1148",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Mon Dec 5 10:08:11 MST 2011",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "12 Dec 2006",
author = "J. A. Bergstra and C. A. Middelburg",
title = "Synchronous cooperation for explicit multi-threading",
journal = j-ACTA-INFO,
volume = "44",
number = "7--8",
pages = "525--569",
month = dec,
year = "2007",
DOI = "https://doi.org/10.1007/s00236-007-0057-9",
ISSN = "0001-5903 (print), 1432-0525 (electronic)",
ISSN-L = "0001-5903",
bibdate = "Wed Jul 9 21:28:19 MDT 2008",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0001-5903&volume=44&issue=7;
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0001-5903&volume=44&issue=7&spage=525",
acknowledgement = ack-nhfb,
fjournal = "Acta Informatica",
journal-URL = "http://www.springerlink.com/content/0001-5903",
author = "Colin Blundell and Joe Devietti and E. Christopher
Lewis and Milo M. K. Martin",
title = "Making the fast case common and the uncommon case
simple in unbounded transactional memory",
journal = j-COMP-ARCH-NEWS,
volume = "35",
number = "2",
pages = "24--34",
month = may,
year = "2007",
DOI = "https://doi.org/10.1145/1273440.1250667",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Jun 17 11:48:43 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Hardware transactional memory has great potential to
simplify the creation of correct and efficient
multithreaded programs, allowing programmers to exploit
more effectively the soon-to-be-ubiquitous multi-core
designs. Several recent proposals have extended the
original bounded transactional memory to unbounded
transactional memory, a crucial step toward
transactions becoming a general-purpose primitive.
Unfortunately, supporting the concurrent execution of
an unbounded number of unbounded transactions is
challenging, and as a result, many proposed
implementations are complex.\par
This paper explores a different approach. First, we
introduce the permissions-only cache to extend the
bound at which transactions overflow to allow the fast,
bounded case to be used as frequently as possible.
Second, we propose OneTM to simplify the implementation
of unbounded transactional memory by bounding the
concurrency of transactions that overflow the cache.
These mechanisms work synergistically to provide a
simple and fast unbounded transactional memory
The permissions-only cache efficiently maintains the
coherence permissions --- but not data --- for blocks read
or written transactionally that have been evicted from
the processor's caches. By holding coherence
permissions for these blocks, the regular cache
coherence protocol can be used to detect transactional
conflicts using only a few bits of on-chip storage per
overflowed cache block. OneTM allows only one
overflowed transaction at a time, relying on the
permissions-only cache to ensure that overflow is
infrequent. We present two implementations. In
OneTM-Serialized, an overflowed transaction simply
stalls all other threads in the application.\par
In OneTM-Concurrent, non-overflowed transactions and
non-transactional code can execute concurrently with
the overflowed transaction, providing more concurrency
while retaining OneTM's core simplifying assumption.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "concurrency; parallel programming; transactional
memory; transactions",
author = "Hans Boehm and Bill Pugh and Doug Lea",
title = "Multithreading in {C} and {C++}",
journal = j-LOGIN,
volume = "32",
number = "1",
pages = "??--??",
month = feb,
year = "2007",
ISSN = "1044-6397",
ISSN-L = "1044-6397",
bibdate = "Fri Dec 7 11:34:27 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://www.usenix.org/publications/login/february-2007-volume-32-number-1/multithreading-c-and-c",
acknowledgement = ack-nhfb,
fjournal = ";login: the USENIX Association newsletter",
author = "Sebastian Burckhardt and Rajeev Alur and Milo M. K.
title = "{CheckFence}: checking consistency of concurrent data
types on relaxed memory models",
journal = j-SIGPLAN,
volume = "42",
number = "6",
pages = "12--21",
month = jun,
year = "2007",
DOI = "https://doi.org/10.1145/1250734.1250737",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:55:30 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Concurrency libraries can facilitate the development
of multi-threaded programs by providing concurrent
implementations of familiar data types such as queues
or sets. There exist many optimized algorithms that can
achieve superior performance on multiprocessors by
allowing concurrent data accesses without using locks.
Unfortunately, such algorithms can harbor subtle
concurrency bugs. Moreover, they require memory
ordering fences to function correctly on relaxed memory
To address these difficulties, we propose a
verification approach that can exhaustively check all
concurrent executions of a given test program on a
relaxed memory model and can verify that they are
observationally equivalent to a sequential execution.
Our CheckFence prototype automatically translates the C
implementation code and the test program into a SAT
formula, hands the latter to a standard SAT solver, and
constructs counterexample traces if there exist
incorrect executions. Applying CheckFence to five
previously published algorithms, we were able to (1)
find several bugs (some not previously known), and (2)
determine how to place memory ordering fences for
relaxed memory models.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrent data structures; lock-free synchronization;
memory models; multi-threading; sequential consistency;
shared-memory multiprocessors; software model
author = "Dipankar Das and P. P. Chakrabarti and Rajeev Kumar",
title = "Functional verification of task partitioning for
multiprocessor embedded systems",
journal = j-TODAES,
volume = "12",
number = "4",
pages = "44:1--44:??",
month = sep,
year = "2007",
DOI = "https://doi.org/10.1145/1278349.1278357",
ISSN = "1084-4309 (print), 1557-7309 (electronic)",
ISSN-L = "1084-4309",
bibdate = "Thu Jun 12 18:09:35 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/todaes/;
abstract = "With the advent of multiprocessor embedded platforms,
application partitioning and mapping have gained
primacy as a design step. The output of this design
step is a multithreaded partitioned application where
each thread is mapped to a processing element
(processor or ASIC) in the multiprocessor platform.
This partitioned application must be verified to be
consistent with the native unpartitioned application.
This verification task is called application (or task)
partitioning verification. \par
This work proposes a code-block-level
containment-checking-based methodology for application
partitioning verification. We use a UML-based
code-block-level modeling language which is rich enough
to model most designs. We formulate the application
partitioning verification problem as a special case of
the containment checking problem, which we call the
complete containment checking problem. We propose a
state space reduction technique specific to the
containment checking, reachability analysis, and
deadlock detection problems. We propose novel data
structures and token propagation methodologies which
enhance the efficiency of containment checking. We
present an efficient containment checking algorithm for
the application partitioning verification problem. We
develop a containment checking tool called TraceMatch
and present experimental results. We present a
comparison of the state space reduction achieved by
TraceMatch with that achieved by formal analysis and
verification tools like Spin, PEP, PROD, and LoLA.",
acknowledgement = ack-nhfb,
articleno = "44",
fjournal = "ACM Transactions on Design Automation of Electronic
Systems (TODAES)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776",
keywords = "Containment checking; multiprocessor embedded systems;
state space reduction; UML activity diagrams",
author = "Jialin Dou and Marcelo Cintra",
title = "A compiler cost model for speculative
journal = j-TACO,
volume = "4",
number = "2",
pages = "12:1--12:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1250727.1250732",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Speculative parallelization is a technique that allows
code sections that cannot be fully analyzed by the
compiler to be aggressively executed in parallel.
However, while speculative parallelization can
potentially deliver significant speedups, several
overheads associated with this technique can limit
these speedups in practice. This paper proposes a novel
compiler static cost model of speculative multithreaded
execution that can be used to predict the resulting
performance. This model attempts to predict the
expected speedups, or slowdowns, of the candidate
speculative sections based on the estimation of the
combined runtime effects of various overheads, and
taking into account the scheduling restrictions of most
speculative execution environments. The model is based
on estimating the likely execution duration of threads
and considers all the possible permutations of these
threads. This model also produces a quantitative
estimate of the speedup, which is different from prior
heuristics that only qualitatively estimate the
benefits of speculative multithreaded execution. In
previous work, a limited version of the framework was
evaluated on a number of loops from a collection of
SPEC benchmarks that suffer mainly from load imbalance
and thread dispatch and commit overheads. In this work,
an extended framework is also evaluated on loops that
may suffer from data-dependence violations.
Experimental results show that prediction accuracy is
lower when loops with violations are included.
Nevertheless, accuracy is still very high for a static
model: the framework can identify, on average, 45\% of
the loops that cause slowdowns and, on average, 96\% of
the loops that lead to speedups; it predicts the
speedups or slowdowns with an error of less than 20\%
for an average of 28\% of the loops across the
benchmarks and with an error of less than 50\% for an
average of 80\% of the loops. Overall, the framework
often outperforms, by as much as 25\%, a naive approach
that attempts to speculatively parallelize all the
loops considered, and is able to curb the large
slowdowns caused in many cases by this naive
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
keywords = "speculative multithreading; speculative
parallelization; thread-level speculation",
author = "Tayfun Elmas and Shaz Qadeer and Serdar Tasiran",
title = "{Goldilocks}: a race and transaction-aware {Java}
journal = j-SIGPLAN,
volume = "42",
number = "6",
pages = "245--255",
month = jun,
year = "2007",
DOI = "https://doi.org/10.1145/1273442.1250762",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:55:30 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Data races often result in unexpected and erroneous
behavior. In addition to causing data corruption and
leading programs to crash, the presence of data races
complicates the semantics of an execution which might
no longer be sequentially consistent. Motivated by
these observations, we have designed and implemented a
Java runtime system that monitors program executions
and throws a DataRaceException when a data race is
about to occur. Analogous to other runtime exceptions,
the DataRaceException provides two key benefits. First,
accesses causing race conditions are interrupted and
handled before they cause errors that may be difficult
to diagnose later. Second, if no DataRaceException is
thrown in an execution, it is guaranteed to be
sequentially consistent. This strong guarantee helps to
rule out many concurrency-related possibilities as the
cause of erroneous behavior. When a DataRaceException
is caught, the operation, thread, or program causing it
can be terminated gracefully. Alternatively, the
DataRaceException can serve as a conflict-detection
mechanism in optimistic uses of concurrency.\par
We start with the definition of data-race-free
executions in the Java memory model. We generalize this
definition to executions that use transactions in
addition to locks and volatile variables for
synchronization. We present a precise and efficient
algorithm for dynamically verifying that an execution
is free of data races. This algorithm generalizes the
Goldilocks algorithm for data-race detection by
handling transactions and providing the ability to
distinguish between read and write accesses. We have
implemented our algorithm and the DataRaceException in
the Kaffe Java Virtual Machine. We have evaluated our
system on a variety of publicly available Java
benchmarks and a few microbenchmarks that combine
lock-based and transaction-based synchronization. Our
experiments indicate that our implementation has
reasonable overhead. Therefore, we believe that in
addition to being a debugging tool, the
DataRaceException may be a viable mechanism to enforce
the safety of executions of multithreaded Java
programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "data-race detection; Java runtime; runtime monitoring;
software transactions",
author = "Joel Emer and Mark D. Hill and Yale N. Patt and Joshua
J. Yi and Derek Chiou and Resit Sendag",
title = "Single-Threaded vs. Multithreaded: Where Should We
Focus?",
journal = j-IEEE-MICRO,
volume = "27",
number = "6",
pages = "14--24",
month = nov # "\slash " # dec,
year = "2007",
DOI = "https://doi.org/10.1109/MM.2007.109",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Wed Jul 2 21:58:03 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Michael Emmi and Jeffrey S. Fischer and Ranjit Jhala
and Rupak Majumdar",
title = "Lock allocation",
journal = j-SIGPLAN,
volume = "42",
number = "1",
pages = "291--296",
month = jan,
year = "2007",
DOI = "https://doi.org/10.1145/1190216.1190260",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:53:14 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We introduce lock allocation, an automatic technique
that takes a multi-threaded program annotated with
atomic sections (that must be executed atomically), and
infers a lock assignment from global variables to locks
and a lock instrumentation that determines where each
lock should be acquired and released such that the
resulting instrumented program is guaranteed to
preserve atomicity and deadlock freedom (provided all
shared state is accessed only within atomic sections).
Our algorithm works in the presence of pointers and
procedures, and sets up the lock allocation problem as
a 0-1 ILP which minimizes the conflict cost between
atomic sections while simultaneously minimizing the
number of locks. We have implemented our algorithm for
both C with pthreads and Java, and have applied it to
infer locks in 15K lines of AOLserver code. Our
automatic allocation produces the same results as hand
annotations for most of this code, while solving the
optimization instances within a second for most
programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "atomicity; ILP; lock inference",
author = "Yaniv Eytani and Klaus Havelund and Scott D. Stoller
and Shmuel Ur",
title = "Towards a framework and a benchmark for testing tools
for multi-threaded programs",
journal = j-CCPE,
volume = "19",
number = "3",
pages = "267--279",
day = "10",
month = mar,
year = "2007",
DOI = "https://doi.org/10.1002/cpe.1068",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Mon Dec 5 10:08:10 MST 2011",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "1 Aug 2006",
author = "Ron Gabor and Shlomo Weiss and Avi Mendelson",
title = "Fairness enforcement in switch on event
multithreading",
journal = j-TACO,
volume = "4",
number = "3",
pages = "15:1--15:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1275937.1275939",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The need to reduce power and complexity will increase
the interest in Switch On Event multithreading
(coarse-grained multithreading). Switch On Event
multithreading is a low-power and low-complexity
mechanism to improve processor throughput by switching
threads on execution stalls. Fairness may, however,
become a problem in a multithreaded processor. Unless
fairness is properly handled, some threads may starve
while others consume all of the processor cycles.
Heuristics that were devised in order to improve
fairness in simultaneous multithreading are not
applicable to Switch On Event multithreading. This
paper defines the fairness metric using the ratio of
the individual threads' speedups and shows how it can
be enforced in Switch On Event multithreading. Fairness
is controlled by forcing additional thread switch
points. These switch points are determined dynamically
by runtime estimation of the single threaded
performance of each of the individual threads. We
analyze the impact of the fairness enforcement
mechanism on aggregate IPC and weighted speedup. We
present simulation results of the performance of Switch
On Event multithreading. Switch On Event multithreading
achieves an average aggregate IPC increase of 26\% over
single thread and 12\% weighted speedup when no
fairness is enforced. In this case, a sixth of our runs
resulted in poor fairness in which one thread ran
extremely slowly (10 to 100 times slower than its
single-thread performance), while the other thread's
performance was hardly affected. By using the proposed
mechanism, we can guarantee fairness at different
levels of strictness and, in most cases, even improve
the weighted speedup.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
keywords = "coarse-grained multithreading; fairness;
multithreading; performance; SOE; Switch on Event
multithreading; throughput; weighted speedup",
author = "Amol Ghoting and Gregory Buehrer and Srinivasan
Parthasarathy and Daehyun Kim and Anthony Nguyen and
Yen-Kuang Chen and Pradeep Dubey",
title = "Cache-conscious frequent pattern mining on modern and
emerging processors",
journal = j-VLDB-J,
volume = "16",
number = "1",
pages = "77--96",
month = jan,
year = "2007",
ISSN = "1066-8888 (print), 0949-877X (electronic)",
ISSN-L = "1066-8888",
bibdate = "Mon Jun 23 10:51:22 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Algorithms are typically designed to exploit the
current state of the art in processor technology.
However, as processor technology evolves, said
algorithms are often unable to derive the maximum
achievable performance on these modern architectures.
In this paper, we examine the performance of frequent
pattern mining algorithms on a modern processor. A
detailed performance study reveals that even the best
frequent pattern mining implementations, with highly
efficient memory managers, still grossly under-utilize
a modern processor. The primary performance bottlenecks
are {\em poor data locality\/} and {\em low instruction
level parallelism (ILP)}. We propose a {\em
cache-conscious prefix tree\/} to address this problem.
The resulting tree improves spatial locality and also
enhances the benefits from hardware cache line
prefetching. Furthermore, the design of this data
structure allows the use of {\em path tiling}, a novel
tiling strategy, to improve temporal locality. The
result is an overall speedup of up to 3.2 when compared
with state of the art implementations. We then show how
these algorithms can be improved further by realizing a
non-naive thread-based decomposition that targets {\em
simultaneously multi-threaded processors (SMT)}. A key
aspect of this decomposition is to ensure cache re-use
between threads that are co-scheduled at a fine
granularity. This optimization affords an additional
speedup of 50\%, resulting in an overall speedup of up
to 4.8. The proposed optimizations also provide
performance improvements on SMPs, and will most likely
be beneficial on emerging processors.",
acknowledgement = ack-nhfb,
fjournal = "VLDB Journal: Very Large Data Bases",
journal-URL = "http://portal.acm.org/toc.cfm?id=J869",
keywords = "architecture-conscious algorithms; association rule
mining; cache-conscious data mining; frequent itemset
mining; frequent pattern mining",
author = "Michael H. Goldwasser and David Letscher",
title = "Introducing network programming into a {CS1} course",
journal = j-SIGCSE,
volume = "39",
number = "3",
pages = "19--22",
month = sep,
year = "2007",
DOI = "https://doi.org/10.1145/1269900.1268793",
ISSN = "0097-8418 (print), 2331-3927 (electronic)",
ISSN-L = "0097-8418",
bibdate = "Sat Nov 17 16:57:36 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "Proceedings of the 12th Annual SIGCSE Conference on
Innovation and Technology in Computer Science Education
abstract = "Incorporating advanced programming concepts into an
introductory programming course has to be done
carefully to avoid overwhelming the students. We
describe our experiences doing network programming in a
CS1 course taught in Python. The simplicity of the
built-in libraries allowed a fair amount of networking
to be introduced in a week-long module of the course.
In this short time we had the students writing both
multithreaded clients and servers.",
acknowledgement = ack-nhfb,
fjournal = "SIGCSE Bulletin (ACM Special Interest Group on
Computer Science Education)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688",
author = "Alexey Gotsman and Josh Berdine and Byron Cook and
Mooly Sagiv",
title = "Thread-modular shape analysis",
journal = j-SIGPLAN,
volume = "42",
number = "6",
pages = "266--277",
month = jun,
year = "2007",
DOI = "https://doi.org/10.1145/1273442.1250765",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:55:30 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We present the first shape analysis for multithreaded
programs that avoids the explicit enumeration of
execution-interleavings. Our approach is to
automatically infer a resource invariant associated
with each lock that describes the part of the heap
protected by the lock. This allows us to use a
sequential shape analysis on each thread. We show that
resource invariants of a certain class can be
characterized as least fixed points and computed via
repeated applications of shape analysis only on each
individual thread. Based on this approach, we have
implemented a thread-modular shape analysis tool and
applied it to concurrent heap-manipulating code from
Windows device drivers.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "abstract interpretation; concurrent programming; shape
analysis; static analysis",
author = "George A. Gravvanis and Victor N. Epitropou and
Konstantinos M. Giannoutakis",
title = "On the performance of parallel approximate inverse
preconditioning using {Java} multithreading
techniques",
journal = j-APPL-MATH-COMP,
volume = "190",
number = "1",
pages = "255--270",
day = "1",
month = jul,
year = "2007",
ISSN = "0096-3003 (print), 1873-5649 (electronic)",
ISSN-L = "0096-3003",
bibdate = "Sat Jul 12 09:03:06 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Applied Mathematics and Computation",
journal-URL = "http://www.sciencedirect.com/science/journal/00963003",
author = "Ibrahim Hur and Calvin Lin",
title = "Memory scheduling for modern microprocessors",
journal = j-TOCS,
volume = "25",
number = "4",
pages = "10:1--10:??",
month = dec,
year = "2007",
DOI = "https://doi.org/10.1145/1314299.1314301",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Mon Jun 16 17:52:15 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
abstract = "The need to carefully schedule memory operations has
increased as memory performance has become increasingly
important to overall system performance. This article
describes the adaptive history-based (AHB) scheduler,
which uses the history of recently scheduled operations
to provide three conceptual benefits: (1) it allows the
scheduler to better reason about the delays associated
with its scheduling decisions, (2) it provides a
mechanism for combining multiple constraints, which is
important for increasingly complex DRAM structures, and
(3) it allows the scheduler to select operations so
that they match the program's mixture of Reads and
Writes, thereby avoiding certain bottlenecks within the
memory controller.\par
We have previously evaluated this scheduler in the
context of the IBM Power5. When compared with the state
of the art, this scheduler improves performance by
15.6\%, 9.9\%, and 7.6\% for the Stream, NAS, and
commercial benchmarks, respectively. This article
expands our understanding of the AHB scheduler in a
variety of ways. Looking backwards, we describe the
scheduler in the context of prior work that focused
exclusively on avoiding bank conflicts, and we show
that the AHB scheduler is superior for the IBM Power5,
which we argue will be representative of future
microprocessor memory controllers. Looking forwards, we
evaluate this scheduler in the context of future
systems by varying a number of microarchitectural
features and hardware parameters. For example, we show
that the benefit of this scheduler increases as we move
to multithreaded environments.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
keywords = "adaptive history-based scheduling; memory scheduling;
memory system performance",
author = "Giorgos Kollias and Efstratios Gallopoulos",
title = "Asynchronous {PageRank} computation in an interactive
multithreading environment",
volume = "07071",
publisher = "International Begegnungs- und Forschungszentrum
f{\"u}r Informatik",
address = "Wadern, Germany",
pages = "????",
year = "2007",
ISBN = "????",
ISBN-13 = "????",
bibdate = "Fri Feb 19 15:32:30 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
series = "Dagstuhl seminar proceedings",
URL = "http://drops.dagstuhl.de/opus/volltexte/2007/1065/pdf/07071.KolliasGiorgios.Paper.1065",
acknowledgement = ack-nhfb,
author = "Nagendra J. Kumar and Vasanth Asokan and Siddhartha
Shivshankar and Alexander G. Dean",
title = "Efficient software implementation of embedded
communication protocol controllers using asynchronous
software thread integration with time- and
space-efficient procedure calls",
journal = j-TECS,
volume = "6",
number = "1",
pages = "2:1--2:??",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1210268.1210270",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Jun 12 15:20:58 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The overhead of context switching limits efficient
scheduling of multiple concurrent threads on a
uniprocessor when real-time requirements exist. A
software-implemented protocol controller may be
crippled by this problem. The available idle time may
be too short to recover through context switching, so
only the primary thread can execute during message
activity, slowing the secondary threads and potentially
missing deadlines. Asynchronous software thread
integration (ASTI) uses coroutine calls and
integration, letting threads make independent progress
efficiently, and reducing the needed context switches.
We demonstrate the methods with a software
implementation of an automotive communication protocol
(J1850) and several secondary threads.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840",
keywords = "asynchronous software thread integration; fine-grain
concurrency; hardware to software migration; J1850;
software-implemented communication protocol
controllers",
author = "James Laudon and Lawrence Spracklen",
title = "The Coming Wave of Multithreaded Chip
Multiprocessors",
journal = j-INT-J-PARALLEL-PROG,
volume = "35",
number = "3",
pages = "299--330",
month = jun,
year = "2007",
DOI = "https://doi.org/10.1007/s10766-007-0033-6",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Wed Jul 9 16:06:21 MDT 2008",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=35&issue=3;
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=35&issue=3&spage=299",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
keywords = "Chip multiprocessing; multithreading; parallel
programming; performance",
author = "H. Q. Le and W. J. Starke and J. S. Fields and F. P.
O'Connell and D. Q. Nguyen and B. J. Ronchetti and W.
M. Sauer and E. M. Schwarz and M. T. Vaden",
title = "{IBM POWER6} microarchitecture",
journal = j-IBM-JRD,
volume = "51",
number = "6",
pages = "639--??",
month = nov,
year = "2007",
ISSN = "0018-8646 (print), 2151-8556 (electronic)",
ISSN-L = "0018-8646",
bibdate = "Mon Jul 7 21:49:07 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.research.ibm.com/journal/rd/516/le.html",
abstract = "This paper describes the implementation of the IBM
POWER6 microprocessor, a two-way simultaneous
multithreaded (SMT) dual-core chip whose key features
include binary compatibility with IBM POWER5
microprocessor-based systems; increased functional
capabilities, such as decimal floating-point and vector
multimedia extensions; significant reliability,
availability, and serviceability enhancements; and
robust scalability with up to 64 physical processors.
Based on a new industry-leading high-frequency core
architecture with enhanced SMT and driven by a
high-throughput symmetric multiprocessing (SMP) cache
and memory subsystem, the POWER6 chip achieves a
significant performance boost compared with its
predecessor, the POWER5 chip. Key extensions to the
coherence protocol enable POWER6 microprocessor-based
systems to achieve better SMP scalability while
enabling reductions in system packaging complexity and
cost.",
acknowledgement = ack-nhfb,
fjournal = "IBM Journal of Research and Development",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
author = "P. Leadbitter and D. Page and N. P. Smart",
title = "Nondeterministic Multithreading",
journal = j-IEEE-TRANS-COMPUT,
volume = "56",
number = "7",
pages = "992--998",
month = jul,
year = "2007",
DOI = "https://doi.org/10.1109/TC.2007.1049",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Mon Jul 4 15:03:40 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4216296",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Peng Li and Steve Zdancewic",
title = "Combining events and threads for scalable network
services implementation and evaluation of monadic,
application-level concurrency primitives",
journal = j-SIGPLAN,
volume = "42",
number = "6",
pages = "189--199",
month = jun,
year = "2007",
DOI = "https://doi.org/10.1145/1273442.1250756",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:55:30 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper proposes to combine two seemingly opposed
programming models for building massively concurrent
network services: the event-driven model and the
multithreaded model. The result is a hybrid design that
offers the best of both worlds---the ease of use and
expressiveness of threads and the flexibility and
performance of events.\par
This paper shows how the hybrid model can be
implemented entirely at the application level using
concurrency monads in Haskell, which provides type-safe
abstractions for both events and threads. This approach
simplifies the development of massively concurrent
software in a way that scales to real-world network
services. The Haskell implementation supports
exceptions, symmetrical multiprocessing, software
transactional memory, asynchronous I/O mechanisms and
application-level network protocol stacks. Experimental
results demonstrate that this monad-based approach has
good performance: the threads are extremely lightweight
(scaling to ten million threads), and the I/O
performance compares favorably to that of Linux NPTL.
tens of thousands of simultaneous, mostly-idle client
connections. Such massively-concurrent programs are
difficult to implement, especially when other
requirements, such as high performance and strong
security, must also be met.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrency; event; Haskell; implementation; monad;
networking; programming; scalability; thread",
author = "Niti Madan and Rajeev Balasubramonian",
title = "Power Efficient Approaches to Redundant
Multithreading",
volume = "18",
number = "8",
pages = "1066--1079",
month = aug,
year = "2007",
DOI = "https://doi.org/10.1109/TPDS.2007.1090",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu Jul 3 14:26:53 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
Systems",
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Aqeel Mahesri and Nicholas J. Wang and Sanjay J.
Patel",
title = "Hardware support for software controlled
multithreading",
journal = j-COMP-ARCH-NEWS,
volume = "35",
number = "1",
pages = "3--12",
month = mar,
year = "2007",
DOI = "https://doi.org/10.1145/1241601.1241606",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Jun 17 11:47:26 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Chip multi-processors have emerged as one of the most
effective uses of the huge number of transistors
available today and in the future, but questions remain
as to the best way to leverage CMPs to accelerate
single threaded applications. Previous approaches rely
on significant speculation to accomplish this goal. Our
proposal, NXA, is less speculative than previous
proposals, relying heavily on software to guarantee
thread correctness, though still allowing parallelism
in the presence of ambiguous dependences. It divides a
single thread of execution into multiple using the
master-worker paradigm where some set of master threads
execute code that spawns tasks for other, worker
threads. The master threads generally consist of
performance critical instructions that can prefetch
data, compute critical control decisions, or compute
performance critical dataflow slices. This prevents
non-critical instructions from competing with critical
instructions for processor resources, allowing the
critical thread (and thus the workload) to complete
faster. Empirical results from performance simulation
show a 20\% improvement in performance on a 2-way CMP
machine, demonstrating that software controlled
multithreading can indeed provide a benefit in the
presence of hardware support.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
remark = "{DASCMP'06}",
author = "Ami Marowka",
title = "Parallel computing on any desktop",
journal = j-CACM,
volume = "50",
number = "9",
pages = "74--78",
month = sep,
year = "2007",
DOI = "https://doi.org/10.1145/1284621.1284622",
ISSN = "0001-0782 (print), 1557-7317 (electronic)",
ISSN-L = "0001-0782",
bibdate = "Mon Jun 16 18:32:57 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Parallelization lets applications exploit the high
throughput of new multicore processors, and the OpenMP
parallel programming model helps developers create
multithreaded applications.",
acknowledgement = ack-nhfb,
fjournal = "Communications of the ACM",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J79",
author = "Chi Cao Minh and Martin Trautmann and JaeWoong Chung
and Austen McDonald and Nathan Bronson and Jared Casper
and Christos Kozyrakis and Kunle Olukotun",
title = "An effective hybrid transactional memory system with
strong isolation guarantees",
journal = j-COMP-ARCH-NEWS,
volume = "35",
number = "2",
pages = "69--80",
month = may,
year = "2007",
DOI = "https://doi.org/10.1145/1250662.1250673",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Jun 17 11:48:43 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We propose signature-accelerated transactional memory
(SigTM), a hybrid TM system that reduces the overhead
of software transactions. SigTM uses hardware
signatures to track the read-set and write-set for
pending transactions and perform conflict detection
between concurrent threads. All other transactional
functionality, including data versioning, is
implemented in software. Unlike previously proposed
hybrid TM systems, SigTM requires no modifications to
the hardware caches, which reduces hardware cost and
simplifies support for nested transactions and
multithreaded processor cores. SigTM is also the first
hybrid TM system to provide strong isolation guarantees
between transactional blocks and non-transactional
accesses without additional read and write barriers in
non-transactional code.\par
Using a set of parallel programs that make frequent use
of coarse-grain transactions, we show that SigTM
accelerates software transactions by 30\% to 280\%. For
certain workloads, SigTM can match the performance of a
full-featured hardware TM system, while for workloads
with large read-sets it can be up to two times slower.
Overall, we show that SigTM combines the performance
characteristics and strong isolation guarantees of
hardware TM implementations with the low cost and
flexibility of software TM systems.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "multi-core architectures; parallel programming; strong
isolation; transactional memory",
author = "Marco Morandini and Paolo Mantegazza",
title = "Using dense storage to solve small sparse linear
systems",
journal = j-TOMS,
volume = "33",
number = "1",
pages = "5:1--5:12",
month = mar,
year = "2007",
DOI = "https://doi.org/10.1145/1206040.1206045",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Sat Apr 14 09:48:58 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "A data structure is used to build a linear solver
specialized for relatively small sparse systems. The
proposed solver, optimized for run-time performance at
the expense of memory footprint, outperforms widely
used direct and sparse solvers for systems with between
100 and 3000 equations. A multithreaded version of the
solver is shown to give some speedups for problems with
medium fill-in, while it does not give any benefit for
very sparse problems.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
author = "Madanlal Musuvathi and Shaz Qadeer",
title = "Iterative context bounding for systematic testing of
multithreaded programs",
journal = j-SIGPLAN,
volume = "42",
number = "6",
pages = "446--455",
month = jun,
year = "2007",
DOI = "https://doi.org/10.1145/1273442.1250785",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:55:30 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multithreaded programs are difficult to get right
because of unexpected interaction between concurrently
executing threads. Traditional testing methods are
inadequate for catching subtle concurrency errors which
manifest themselves late in the development cycle and
post-deployment. Model checking or systematic
exploration of program behavior is a promising
alternative to traditional testing methods. However, it
is difficult to perform systematic search on large
programs as the number of possible program behaviors
grows exponentially with the program size. Confronted
with this state-explosion problem, traditional model
checkers perform iterative depth-bounded search.
Although effective for message-passing software,
iterative depth-bounding is inadequate for
multithreaded software.\par
This paper proposes iterative context-bounding, a new
search algorithm that systematically explores the
executions of a multithreaded program in an order that
prioritizes executions with fewer context switches. We
distinguish between preempting and nonpreempting
context switches, and show that bounding the number of
preempting context switches to a small number
significantly alleviates the state explosion, without
limiting the depth of explored executions. We show both
theoretically and empirically that context-bounded
search is an effective method for exploring the
behaviors of multithreaded programs. We have
implemented our algorithm in two model checkers and
applied it to a number of real-world multithreaded
programs. Our implementation uncovered 9 previously
unknown bugs in our benchmarks, each of which was
exposed by an execution with at most 2 preempting
context switches. Our initial experience with the
technique is encouraging and demonstrates that
iterative context-bounding is a significant improvement
over existing techniques for testing multithreaded
programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrency; context-bounding; model checking;
multithreading; partial-order reduction; shared-memory
programs; software testing",
author = "Mayur Naik and Alex Aiken",
title = "Conditional must not aliasing for static race
detection",
journal = j-SIGPLAN,
volume = "42",
number = "1",
pages = "327--338",
month = jan,
year = "2007",
DOI = "https://doi.org/10.1145/1190216.1190265",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:53:14 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Race detection algorithms for multi-threaded programs
using the common lock-based synchronization idiom must
correlate locks with the memory locations they guard.
The heart of a proof of race freedom is showing that if
two locks are distinct, then the memory locations they
guard are also distinct. This is an example of a
general property we call conditional must not aliasing:
Under the assumption that two objects are not aliased,
prove that two other objects are not aliased. This
paper introduces and gives an algorithm for conditional
must not alias analysis and discusses experimental
results for sound race detection of Java programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrency; Java; multi-threading; static race
detection; synchronization",
author = "Satish Narayanasamy and Zhenghao Wang and Jordan
Tigani and Andrew Edwards and Brad Calder",
title = "Automatically classifying benign and harmful data
races all using replay analysis",
journal = j-SIGPLAN,
volume = "42",
number = "6",
pages = "22--31",
month = jun,
year = "2007",
DOI = "https://doi.org/10.1145/1250734.1250738",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:55:30 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Many concurrency bugs in multi-threaded programs are
due to data races. There have been many efforts to
develop static and dynamic mechanisms to automatically
find the data races. Most of the prior work has focused
on finding the data races and eliminating the false
positives.\par
In this paper, we instead focus on a dynamic analysis
technique to automatically classify the data races into
two categories --- the data races that are potentially
benign and the data races that are potentially harmful.
A harmful data race is a real bug that needs to be
fixed. This classification is needed to focus the
triaging effort on those data races that are
potentially harmful. Without prioritizing the data
races we have found that there are too many data races
to triage. Our second focus is to automatically provide
to the developer a reproducible scenario of the data
race, which allows the developer to understand the
different effects of a harmful data race on a program's
behavior.\par
To achieve the above, we record a multi-threaded
program's execution in a replay log. The replay log is
used to replay the multi-threaded program, and during
replay we find the data races using a happens-before
based algorithm. To automatically classify if a data
race that we find is potentially benign or potentially
harmful, we replay the execution twice for a given data
race --- one for each possible order between the
conflicting memory operations. If the two replays for
the two orders produce the same result, then we
classify the data race to be potentially benign. We
discuss our experiences in using our replay based
dynamic data race checker on several Microsoft
applications.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "benign data races; concurrency bugs; replay",
author = "Chris Ostler and Karam S. Chatha and Vijay Ramamurthi
and Krishnan Srinivasan",
title = "{ILP} and heuristic techniques for system-level design
on network processor architectures",
journal = j-TODAES,
volume = "12",
number = "4",
pages = "48:1--48:??",
month = sep,
year = "2007",
DOI = "https://doi.org/10.1145/1278349.1278361",
ISSN = "1084-4309 (print), 1557-7309 (electronic)",
ISSN-L = "1084-4309",
bibdate = "Thu Jun 12 18:09:35 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/todaes/;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Network processors incorporate several architectural
features, including symmetric multiprocessing (SMP),
block multithreading, and multiple memory elements, to
support the high-performance requirements of current
day applications. This article presents automated
system-level design techniques for application
development on such architectures. We propose integer
linear programming formulations and heuristic
techniques for process allocation and data mapping on
SMP and block-multithreading-based network processors.
The techniques incorporate process transformations and
multithreading-aware data mapping to maximize the
throughput of the application. The article presents
experimental results that evaluate the techniques by
implementing network processing applications on the
Intel IXP 2400 architecture.",
acknowledgement = ack-nhfb,
articleno = "48",
fjournal = "ACM Transactions on Design Automation of Electronic
Systems (TODAES)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776",
keywords = "block multithreading; multiprocessor",
author = "Soyeon Park and Weihang Jiang and Yuanyuan Zhou and
Sarita Adve",
title = "Managing energy-performance tradeoffs for
multithreaded applications on multiprocessor
architectures",
journal = j-SIGMETRICS,
volume = "35",
number = "1",
pages = "169--180",
month = jun,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1254882.1254902",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Fri Jun 27 09:42:48 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In modern computers, non-performance metrics such as
energy consumption have become increasingly important,
requiring tradeoff with performance. A recent work has
proposed performance-guaranteed energy management, but
it is designed specifically for sequential applications
and cannot be used to a large class of multithreaded
applications running on high end computers and data
servers.\par
To address the above problem, this paper makes the
first attempt to provide performance-guaranteed energy
management for multithreaded applications on
multiprocessor architectures. We first conduct a
comprehensive study on the effects of energy adaptation
on thread synchronizations and show that a
multithreaded application suffers from not only local
slowdowns due to energy adaptation, but also
significant slowdowns propagated from other threads
because of synchronization. Based on these findings, we
design three Synchronization-Aware (SA) algorithms, LWT
(Lock Waiting Time-based), CSL (Critical Section
Length-based) and ODP (Operation Delay
Propagation-based) algorithms, to estimate the energy
adaptation-induced slowdowns on each thread. The local
slowdowns are then combined across multiple threads via
three aggregation methods (MAX, AVG and SUM) to
estimate the overall application slowdown.\par
We evaluate our methods using a large multithreaded
commercial application, IBM DB2 with
industrial-strength online transaction processing
(OLTP) workloads, and six SPLASH parallel scientific
applications. Our experimental results show that LWT
combined with the MAX aggregation method not only
controls the performance slowdown within the specified
limits but also conserves the most energy.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
keywords = "energy and performance tradeoffs; low power design;
memory energy management; multithreaded applications",
author = "Pratibha Permandla and Michael Roberson and
Chandrasekhar Boyapati",
title = "A type system for preventing data races and deadlocks
in the {Java Virtual Machine} language: 1",
journal = j-SIGPLAN,
volume = "42",
number = "7",
pages = "10--10",
month = jul,
year = "2007",
DOI = "https://doi.org/10.1145/1254766.1254768",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:57:50 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In previous work on SafeJava we presented a type
system extension to the Java source language that
statically prevents data races and deadlocks in
multithreaded programs. SafeJava is expressive enough
to support common programming patterns, its type
checking is fast and scalable, and it requires little
programming overhead. SafeJava thus offers a promising
approach for making multithreaded programs more
reliable. This paper presents a corresponding type
system extension for the Java virtual machine language
(JVML). We call the resulting language SafeJVML.
Well-typed SafeJVML programs are guaranteed to be free
of data races and deadlocks. Designing a corresponding
type system for JVML is important because most Java
code is shipped in the JVML format. Designing a
corresponding type system for JVML is nontrivial
because of important differences between Java and JVML.
In particular, the absence of block structure in JVML
programs and the fact that they do not use named local
variables the way Java programs do make the type
systems for Java and JVML significantly different. For
example, verifying absence of races and deadlocks in
JVML programs requires performing an alias analysis,
something that was not necessary for verifying absence
of races and deadlocks in Java programs. This paper
presents static and dynamic semantics for SafeJVML. It
also includes a proof that the SafeJVML type system is
sound and that it prevents data races and deadlocks. To
the best of our knowledge, this is the first type
system for JVML that statically ensures absence of
synchronization errors.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "data races; deadlocks; ownership types; SafeJava",
author = "Eli Pozniansky and Assaf Schuster",
title = "{MultiRace}: efficient on-the-fly data race detection
in multithreaded {C++} programs",
journal = j-CCPE,
volume = "19",
number = "3",
pages = "327--340",
day = "10",
month = mar,
year = "2007",
DOI = "https://doi.org/10.1002/cpe.1064",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Mon Dec 5 10:08:10 MST 2011",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "1 Aug 2006",
author = "Grigore Ro{\c{s}}u and Koushik Sen",
title = "An instrumentation technique for online analysis of
multithreaded programs",
journal = j-CCPE,
volume = "19",
number = "3",
pages = "311--325",
day = "10",
month = mar,
year = "2007",
DOI = "https://doi.org/10.1002/cpe.1066",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Mon Dec 5 10:08:10 MST 2011",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "1 Aug 2006",
author = "Joseph J. Sharkey and Dmitry V. Ponomarev",
title = "Exploiting Operand Availability for Efficient
Simultaneous Multithreading",
journal = j-IEEE-TRANS-COMPUT,
volume = "56",
number = "2",
pages = "208--223",
month = feb,
year = "2007",
DOI = "https://doi.org/10.1109/TC.2007.28",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Mon Jul 4 15:03:37 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4042681",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Xudong Shi and Feiqi Su and Jih-kwon Peir and Ye Xia
and Zhen Yang",
title = "{CMP} cache performance projection: accessibility vs.
capacity",
journal = j-COMP-ARCH-NEWS,
volume = "35",
number = "1",
pages = "13--20",
month = mar,
year = "2007",
DOI = "https://doi.org/10.1145/1241601.1241607",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Jun 17 11:47:26 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Efficiently utilizing on-chip storage space on
Chip-Multiprocessors (CMPs) has become an important
research topic. Tradeoffs between data accessibility
and effective on-chip capacity have been studied
extensively. It requires costly simulations to
understand a wide-spectrum of the design space. In this
paper, we first develop an abstract model for
understanding the performance impact with respect to
data replication. To overcome the lack of real-time
interactions among multiple cores in the abstract
model, we propose a global stack simulation strategy to
study the performance of a variety of cache
organizations on CMPs. The global stack logically
incorporates a shared stack and per-core private stacks
to collect shared/private reuse (stack) distances for
every memory reference in a single simulation pass.
With the collected reuse distances, performance in
terms of hits/misses and average memory access times
can be calculated for various cache organizations. We
verify the stack results against individual
execution-driven simulations that consider realistic
cache parameters and delays using a set of commercial
multithreaded workloads. The results show that stack
simulations can accurately model the performance of
various cache organizations. The single-pass stack
simulation results demonstrate that the effectiveness
of various techniques for optimizing the CMP on-chip
storage is closely related to the working sets of the
workloads as well as to the total cache sizes.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "CMP caches; data replication; performance modeling and
projection; stack simulation",
remark = "{DASCMP'06}",
author = "Yannis Smaragdakis and Anthony Kay and Reimer Behrends
and Michal Young",
title = "Transactions with isolation and cooperation",
journal = j-SIGPLAN,
volume = "42",
number = "10",
pages = "191--210",
month = oct,
year = "2007",
DOI = "https://doi.org/10.1145/1297027.1297042",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 11:00:28 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We present the TIC (Transactions with Isolation and
Cooperation) model for concurrent programming. TIC adds
to standard transactional memory the ability for a
transaction to observe the effects of other threads at
selected points. This allows transactions to cooperate,
as well as to invoke nonrepeatable or irreversible
operations, such as I/O. Cooperating transactions run
the danger of exposing intermediate state and of having
other threads change the transaction's state. The TIC
model protects against unanticipated interference by
having the type system keep track of all operations
that may (transitively) violate the atomicity of a
transaction and require the programmer to establish
consistency at appropriate points. The result is a
programming model that is both general and simple. We
have used the TIC model to re-engineer existing
lock-based applications including a substantial
multi-threaded web mail server and a memory allocator
with coarse-grained locking. Our experience confirms
the features of the TIC model: It is convenient for the
programmer, while maintaining the benefits of
transactional memory.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "nested transactions; open-nesting; punctuation; TIC;
transactional memory",
author = "Dominic Sweetman",
title = "See {MIPS} Run",
publisher = pub-MORGAN-KAUFMANN,
address = pub-MORGAN-KAUFMANN:adr,
edition = "Second",
pages = "xix + 492",
year = "2007",
ISBN = "0-12-088421-6",
ISBN-13 = "978-0-12-088421-6",
LCCN = "QA76.9.A73 S88 2007",
bibdate = "Thu Jun 20 10:21:55 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/linux.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Embedded computer systems --- Programming; MIPS
(Computer architecture); RISC microprocessors",
libnote = "Not yet in my library.",
tableofcontents = "1: RISCs and MIPS architectures / 1 \\
2: MIPS architecture / 29 \\
3: Coprocessor 0: MIPS processor control / 53 \\
4: How caches work on MIPS processors / 79 \\
5: Exceptions, interrupts, and initialization / 105 \\
6: Low-level memory management and the TLB / 131 \\
7: Floating-point support / 151 \\
8: Complete guide to the MIPS instruction set / 183 \\
9: Reading MIPS assembly language / 263 \\
10: Porting software to the MIPS architecture / 279 \\
11: MIPS software standards (ABIs) / 311 \\
12: Debugging MIPS designs - debug and profiling
features / 339 \\
13: GNU/Linux from eight miles high / 363 \\
14: How hardware and software work together / 371 \\
15: MIPS specific issues in the Linux kernel / 399 \\
16: Linux application code, PIC, and libraries / 409 \\
Appendix A: MIPS multithreading / 415 \\
Appendix B: Other optional extensions to the MIPS
instruction set",
author = "David Tam and Reza Azimi and Michael Stumm",
title = "Thread clustering: sharing-aware scheduling on
{SMP--CMP--SMT} multiprocessors",
journal = j-OPER-SYS-REV,
volume = "41",
number = "3",
pages = "47--58",
month = jun,
year = "2007",
DOI = "https://doi.org/10.1145/1272996.1273004",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Fri Jun 20 17:16:31 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The major chip manufacturers have all introduced chip
multiprocessing (CMP) and simultaneous multithreading
(SMT) technology into their processing units. As a
result, even low-end computing systems and game
consoles have become shared memory multiprocessors with
L1 and L2 cache sharing within a chip. Mid- and
large-scale systems will have multiple processing chips
and hence consist of an SMP-CMP-SMT configuration with
non-uniform data sharing overheads. Current operating
system schedulers are not aware of these new cache
organizations, and as a result, distribute threads
across processors in a way that causes many
unnecessary, long-latency cross-chip cache
accesses.\par
In this paper we describe the design and implementation
of a scheme to schedule threads based on sharing
patterns detected online using features of standard
performance monitoring units (PMUs) available in
today's processing units. The primary advantage of
using the PMU infrastructure is that it is fine-grained
(down to the cache line) and has relatively low
overhead. We have implemented our scheme in Linux
running on an 8-way Power5 SMP-CMP-SMT
multi-processor. For commercial multithreaded server
workloads (VolanoMark, SPECjbb, and RUBiS), we are able
to demonstrate reductions in cross-chip cache accesses
of up to 70\%. These reductions lead to
application-reported performance improvements of up to
7\%.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
keywords = "affinity scheduling; cache behavior; cache locality;
CMP; detecting sharing; hardware performance counters;
hardware performance monitors; multithreading;
performance monitoring unit; resource allocation;
shared caches; sharing; simultaneous multithreading;
single-chip multiprocessors; SMP; SMT; thread
migration; thread placement; thread scheduling",
author = "Kristen R. Walcott and Greg Humphreys and Sudhanva
Gurumurthi",
title = "Dynamic prediction of architectural vulnerability from
microarchitectural state",
journal = j-COMP-ARCH-NEWS,
volume = "35",
number = "2",
pages = "516--527",
month = may,
year = "2007",
DOI = "https://doi.org/10.1145/1250662.1250726",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Jun 17 11:48:43 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Transient faults due to particle strikes are a key
challenge in microprocessor design. Driven by
exponentially increasing transistor counts, per-chip
faults are a growing burden. To protect against soft
errors, redundancy techniques such as redundant
multithreading (RMT) are often used. However, these
techniques assume that the probability that a
structural fault will result in a soft error (i.e., the
Architectural Vulnerability Factor (AVF)) is 100
percent, unnecessarily draining processor resources.
Due to the high cost of redundancy, there have been
efforts to throttle RMT at runtime. To date, these
methods have not incorporated an AVF model and
therefore tend to be ad hoc. Unfortunately, computing
the AVF of complex microprocessor structures (e.g., the
ISQ) can be quite involved.\par
To provide probabilistic guarantees about fault
tolerance, we have created a rigorous characterization
of AVF behavior that can be easily implemented in
hardware. We experimentally demonstrate AVF variability
within and across the SPEC2000 benchmarks and identify
strong correlations between structural AVF values and a
small set of processor metrics. Using these simple
indicators as predictors, we create a proof-of-concept
RMT implementation that demonstrates that AVF
prediction can be used to maintain a low fault
tolerance level without significant performance
impact.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "architecture vulnerability factor; microarchitecture;
performance; redundant multithreading; reliability",
author = "Perry H. Wang and Jamison D. Collins and Gautham N.
Chinya and Hong Jiang and Xinmin Tian and Milind Girkar
and Nick Y. Yang and Guei-Yuan Lueh and Hong Wang",
title = "{EXOCHI}: architecture and programming environment for
a heterogeneous multi-core multithreaded system",
journal = j-SIGPLAN,
volume = "42",
number = "6",
pages = "156--166",
month = jun,
year = "2007",
DOI = "https://doi.org/10.1145/1250734.1250753",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:55:30 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Future mainstream microprocessors will likely
integrate specialized accelerators, such as GPUs, onto
a single die to achieve better performance and power
efficiency. However, it remains a keen challenge to
program such a heterogeneous multicore platform, since
these specialized accelerators feature ISAs and
functionality that are significantly different from the
general purpose CPU cores. In this paper, we present
EXOCHI: (1) Exoskeleton Sequencer (EXO), an
architecture to represent heterogeneous accelerators as
ISA-based MIMD architecture resources, and a shared
virtual memory heterogeneous multithreaded program
execution model that tightly couples specialized
accelerator cores with general-purpose CPU cores, and
(2) C for Heterogeneous Integration (CHI), an
integrated C/C++ programming environment that supports
accelerator-specific inline assembly and
domain-specific languages. The CHI compiler extends the
OpenMP pragma for heterogeneous multithreading
programming, and produces a single fat binary with code
sections corresponding to different instruction sets.
The runtime can judiciously spread parallel computation
across the heterogeneous cores to optimize performance
and power.\par
We have prototyped the EXO architecture on a physical
heterogeneous platform consisting of an Intel{\reg}
Core{\TM} 2 Duo processor and an 8-core 32-thread
Intel{\reg} Graphics Media Accelerator X3000. In
addition, we have implemented the CHI integrated
programming environment with the Intel{\reg} C++
Compiler, runtime toolset, and debugger. On the EXO
prototype system, we have enhanced a suite of
production-quality media kernels for video and image
processing to utilize the accelerator through the CHI
programming interface, achieving significant speedup
(1.41X to 10.97X) over execution on the IA32 CPU
alone.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "GPU; heterogeneous multi-cores; openMP",
author = "Qin Wang and Junpu Chen and Weihua Zhang and Min Yang
and Binyu Zang",
title = "Optimizing software cache performance of packet
processing applications",
journal = j-SIGPLAN,
volume = "42",
number = "7",
pages = "227--236",
month = jul,
year = "2007",
DOI = "https://doi.org/10.1145/1273444.1254808",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 10:57:50 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Network processors (NPs) are widely used in many types
of networking equipment due to their high performance
and flexibility. For most NPs, software cache is used
instead of hardware cache due to the chip area, cost
and power constraints. Therefore, programmers should
take full responsibility for software cache management
which is neither intuitive nor easy to most of them.
Actually, without an effective use of it, long memory
access latency will be a critical limiting factor to
overall applications. Prior researches like hardware
multi-threading, wide-word accesses and packet access
combination for caching have already been applied to
help programmers to overcome this bottleneck. However,
most of them do not make enough use of the
characteristics of packet processing applications and
often perform intraprocedural optimizations only. As a
result, the binary codes generated by those techniques
often get lower performance than that comes from
hand-tuned assembly programming for some applications.
In this paper, we propose an algorithm including two
techniques --- Critical Path Based Analysis (CPBA) and
Global Adaptive Localization (GAL), to optimize the
software cache performance of packet processing
applications. Packet processing applications usually
have several hot paths and CPBA tries to insert
localization instructions according to their execution
frequencies. For further optimizations, GAL eliminates
some redundant localization instructions by
interprocedural analysis and optimizations. Our
algorithm is applied on some representative
applications. Experiment results show that it leads to
an average speedup by a factor of 1.974.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "local memory; network processor; optimization",
author = "Jun Yan and Wei Zhang",
title = "Hybrid multi-core architecture for boosting
single-threaded performance",
journal = j-COMP-ARCH-NEWS,
volume = "35",
number = "1",
pages = "141--148",
month = mar,
year = "2007",
DOI = "https://doi.org/10.1145/1241601.1241603",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Jun 17 11:47:26 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The scaling of technology and the diminishing return
of complicated uniprocessors have driven the industry
towards multicore processors. While multithreaded
applications can naturally leverage the enhanced
throughput of multi-core processors, a large number of
important applications are single-threaded, which
cannot automatically harness the potential of
multi-core processors. In this paper, we propose a
compiler-driven heterogeneous multicore architecture,
consisting of tightly-integrated VLIW (Very Long
Instruction Word) and superscalar processors on a
single chip, to automatically boost the performance of
single-threaded applications without compromising the
capability to support multithreaded programs. In the
proposed multi-core architecture, while the
high-performance VLIW core is used to run code segments
with high instruction-level parallelism (ILP) extracted
by the compiler; the superscalar core can be exploited
to deal with the runtime events that are typically
difficult for the VLIW core to handle, such as L2 cache
misses. Our initial experimental results by running the
preexecution thread on the superscalar core to mitigate
the L2 cache misses of the main thread on the VLIW core
indicate that the proposed VLIW/superscalar multi-core
processor can automatically improve the performance of
single-threaded general-purpose applications by up to
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Jin-Min Yang and Da-Fang Zhang and Xue-Dong Yang and
Wen-Wei Li",
title = "Reliable user-level rollback recovery implementation
for multithreaded processes on {Windows}",
journal = j-SPE,
volume = "37",
number = "3",
pages = "331--346",
month = mar,
year = "2007",
DOI = "https://doi.org/10.1002/spe.771",
ISSN = "0038-0644 (print), 1097-024X (electronic)",
ISSN-L = "0038-0644",
bibdate = "Wed Oct 17 18:33:14 MDT 2007",
bibsource = "http://www.interscience.wiley.com/jpages/0038-0644;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Software---Practice and Experience",
journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
onlinedate = "24 Oct 2006",
author = "J. Zebchuk and A. Moshovos",
title = "A Building Block for Coarse-Grain Optimizations in the
On-Chip Memory Hierarchy",
journal = "IEEE Computer Architecture Letters",
volume = "6",
number = "2",
pages = "33--36",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Current on-chip block-centric memory hierarchies
exploit access patterns at the fine-grain scale of
small blocks. Several recently proposed memory
hierarchy enhancements for coherence traffic reduction
and prefetching suggest that additional useful patterns
emerge with a macroscopic, coarse-grain view. This
paper presents RegionTracker, a dual-grain, on-chip
cache design that exposes coarse-grain behavior while
maintaining block-level communication. RegionTracker
eliminates the extraneous, often imprecise coarse-grain
tracking structures of previous proposals. It can be
used as the building block for coarse-grain
optimizations, reducing their overall cost and easing
their adoption. Using full-system simulation of a
quad-core chip multiprocessor and commercial workloads,
we demonstrate that RegionTracker overcomes the
inefficiencies of previous coarse-grain cache designs.
We also demonstrate how RegionTracker boosts the
benefits and reduces the cost of a previously proposed
snoop reduction technique.",
acknowledgement = ack-nhfb,
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "access patterns; Bandwidth; cache storage; Cache
storage; coarse-grain optimizations; coherence traffic
reduction; Cost function; Design optimization;
Explosions; Information management; Memory management;
Multithreading; on-chip memory hierarchy; optimising
compilers; Prefetching; prefetching; Proposals;
quad-core chip multiprocessor; RegionTracker dual-grain
on-chip cache design; system-on-chip",
author = "Parosh Aziz Abdulla and Fr{\'e}d{\'e}ric Haziza and
Mats Kindahl",
title = "Model checking race-freeness",
journal = j-COMP-ARCH-NEWS,
volume = "36",
number = "5",
pages = "72--79",
month = dec,
year = "2008",
DOI = "https://doi.org/10.1145/1556444.1556454",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri Jun 26 11:50:56 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "With the introduction of highly concurrent systems in
standard desktop computers, ensuring correctness of
industrial-size concurrent programs is becoming
increasingly important. One of the most important
standards in use for developing multi-threaded programs
is the POSIX Threads standard, commonly known as
PThreads. Of particular importance, the analysis of
industrial code should, as far as possible, be
automatic and not require annotations or other forms of
specifications of the code.\par
Model checking has been one of the most successful
approaches to program verification during the last two
decades. The size and complexity of applications which
can be handled have increased rapidly through
integration with symbolic techniques. These methods are
designed to work on finite (but large) state spaces.
This framework fails to deal with several essential
aspects of behaviours for multithreaded programs: there
is no bound a priori on the number of threads which may
arise in a given run of the system; each thread
manipulates local variables which often range over
unbounded domains; and the system has a dynamic
structure in the sense that threads can be created and
killed throughout execution of the system. In this
paper we concentrate on checking a particular class of
properties for concurrent programs, namely safety
properties. In particular, we focus on race-freeness,
that is, the absence of race conditions (also known as
data races) in shared-variable pthreaded programs.\par
We will follow a particular methodology which we have
earlier developed for model checking general classes of
infinite-state systems [1, 3, 6, 8, 9] and apply a
symbolic backward reachability analysis to verify the
safety property. Since we construct a model as an
over-approximation of the original program, proving the
safety property in the model implies that the property
also holds in the original system. Surprisingly, it
leads to a quite efficient analysis which can be
carried out fully automatically.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Erika {\'A}brah{\'a}m and Frank S. de Boer and
Willem-Paul de Roever and Martin Steffen",
title = "A Deductive Proof System for Multithreaded {Java} with
Exceptions",
journal = j-FUND-INFO,
volume = "82",
number = "4",
pages = "391--463",
month = jul,
year = "2008",
ISSN = "0169-2968 (print), 1875-8681 (electronic)",
ISSN-L = "0169-2968",
bibdate = "Sat Mar 5 17:06:39 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fundinfo2000.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Fundamenta Informaticae",
journal-URL = "http://content.iospress.com/journals/fundamenta-informaticae",
author = "Michael D. Adams and R. Kent Dybvig",
title = "Efficient nondestructive equality checking for trees
and graphs",
journal = j-SIGPLAN,
volume = "43",
number = "9",
pages = "179--188",
month = sep,
year = "2008",
DOI = "https://doi.org/10.1145/1411203.1411230",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Sep 23 17:31:25 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The Revised$^6$ Report on Scheme requires its generic
equivalence predicate, equal?, to terminate even on
cyclic inputs. While the terminating equal? can be
implemented via a DFA-equivalence or union-find
algorithm, these algorithms usually require an
additional pointer to be stored in each object, are not
suitable for multithreaded code due to their
destructive nature, and may be unacceptably slow for
the small acyclic values that are the most likely
inputs to the predicate.\par
This paper presents a variant of the union-find
algorithm for equal? that addresses these issues. It
performs well on large and small, cyclic and acyclic
inputs by interleaving a low-overhead algorithm that
terminates only for acyclic inputs with a more general
algorithm that handles cyclic inputs. The algorithm
terminates for all inputs while never being more than a
small factor slower than whichever of the acyclic or
union-find algorithms would have been faster. Several
intermediate algorithms are also presented, each of
which might be suitable for use in a particular
application, though only the final algorithm is
suitable for use in a library procedure, like equal?,
that must work acceptably well for all inputs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "dfa equivalence; eq hash tables; equality; scheme;
union-find",
author = "Kunal Agrawal and Charles E. Leiserson and Yuxiong He
and Wen Jing Hsu",
title = "Adaptive work-stealing with parallelism feedback",
journal = j-TOCS,
volume = "26",
number = "3",
pages = "7:1--7:32",
month = sep,
year = "2008",
DOI = "https://doi.org/10.1145/1394441.1394443",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Wed Sep 17 14:28:13 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multiprocessor scheduling in a shared multiprogramming
environment can be structured as two-level scheduling,
where a kernel-level job scheduler allots processors to
jobs and a user-level thread scheduler schedules the
work of a job on its allotted processors. We present a
randomized work-stealing thread scheduler for fork-join
multithreaded jobs that provides continual parallelism
feedback to the job scheduler in the form of requests
for processors. Our A-STEAL algorithm is appropriate
for large parallel servers where many jobs share a
common multiprocessor resource and in which the number
of processors available to a particular job may vary
during the job's execution. Assuming that the job
scheduler never allots a job more processors than
requested by the job's thread scheduler, A-STEAL
guarantees that the job completes in near-optimal time
while utilizing at least a constant fraction of the
allotted processors.\par
We model the job scheduler as the thread scheduler's
adversary, challenging the thread scheduler to be
robust to the operating environment as well as to the
job scheduler's administrative policies. For example,
the job scheduler might make a large number of
processors available exactly when the job has little
use for them. To analyze the performance of our
adaptive thread scheduler under this stringent
adversarial assumption, we introduce a new technique
called {\em trim analysis,\/} which allows us to prove
that our thread scheduler performs poorly on no more
than a small number of time steps, exhibiting
near-optimal behavior on the vast majority.\par
More precisely, suppose that a job has work $ T_1 $ and
span $ T_\infty $. On a machine with $P$ processors,
A-STEAL completes the job in an expected duration of $
O(T_1 / \tilde {P} + T_\infty + L \lg P)$ time steps,
where $L$ is the length of a scheduling quantum, and $
\tilde {P}$ denotes the $ O(T_\infty + L \lg
P)$-trimmed availability. This quantity is the average
of the processor availability over all time steps
except the $ O(T_\infty + L \lg P)$ time steps that
have the highest processor availability. When the job's
parallelism dominates the trimmed availability, that
is, $ \tilde {P} \ll T_1 / T_\infty $, the job achieves
nearly perfect linear speedup. Conversely, when the
trimmed mean dominates the parallelism, the asymptotic
running time of the job is nearly the length of its
span, which is optimal.\par
We measured the performance of A-STEAL on a simulated
multiprocessor system using synthetic workloads. For
jobs with sufficient parallelism, our experiments
confirm that A-STEAL provides almost perfect linear
speedup across a variety of processor availability
profiles. We compared A-STEAL with the ABP algorithm,
an adaptive work-stealing thread scheduler developed by
Arora et al. [1998] which does not employ parallelism
feedback. On moderately to heavily loaded machines with
large numbers of processors, A-STEAL typically
completed jobs more than twice as quickly as ABP,
despite being allotted the same number or fewer
processors on every step, while wasting only 10\% of
the processor cycles wasted by ABP.",
acknowledgement = ack-nhfb,
articleno = "7",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
keywords = "adaptive scheduling; adversary; instantaneous
parallelism; job scheduling; multiprocessing;
multiprogramming; parallel computation; parallelism
feedback; processor allocation; randomized algorithm;
space sharing; span; thread scheduling; trim analysis;
two-level scheduling; work; work-stealing",
author = "Zachary Anderson and David Gay and Rob Ennals and Eric
Brewer",
title = "{SharC}: checking data sharing strategies for
multithreaded {C}",
journal = j-SIGPLAN,
volume = "43",
number = "6",
pages = "149--158",
month = jun,
year = "2008",
DOI = "https://doi.org/10.1145/1379022.1375600",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sat Mar 11 17:33:54 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Unintended or unmediated data sharing is a frequent
cause of insidious bugs in multithreaded programs. We
present a tool called SharC (short for Sharing Checker)
that allows a user to write lightweight annotations to
declare how they believe objects are being shared
between threads in their program. SharC uses a
combination of static and dynamic analyses to check
that the program conforms to this specification.\par
SharC allows any type to have one of five 'sharing
modes' -- private to the current thread, read-only,
shared under the control of a specified lock,
intentionally racy, or checked dynamically. The dynamic
mode uses run-time checking to verify that objects are
either read-only, or only accessed by one thread. This
allows us to check programs that would be difficult to
check with a purely static system. If the user does not
give a type an explicit annotation, then SharC uses a
static type-qualifier analysis to infer that it is
either private or should be checked dynamically.\par
SharC allows objects to move between different sharing
modes at runtime by using reference counting to check
that there are no other references to the objects when
they change mode.\par
SharC's baseline dynamic analysis can check any C
program, but is slow, and will generate false warnings
about intentional data sharing. As the user adds more
annotations, false warnings are reduced, and
performance improves. We have found in practice that
very few annotations are needed to describe all sharing
and give reasonable performance. We ran SharC on 6
legacy C programs, summing to over 600k lines of code,
and found that a total of only 60 simple annotations
were needed to remove all false positives and to reduce
performance overhead to only 2-14\%.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "data-race",
author = "Evangelia Athanasaki and Nikos Anastopoulos and
Kornilios Kourtis and Nectarios Koziris",
title = "Exploring the performance limits of simultaneous
multithreading for memory intensive applications",
volume = "44",
number = "1",
pages = "64--97",
month = apr,
year = "2008",
DOI = "https://doi.org/10.1007/s11227-007-0149-x",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Wed Jul 9 17:32:34 MDT 2008",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=44&issue=1;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=44&issue=1&spage=64",
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
keywords = "Instruction-level parallelism; Performance analysis;
Simultaneous multithreading; Software prefetching;
Speculative precomputation; Thread-level parallelism",
author = "Joshua Auerbach and David F. Bacon and Rachid
Guerraoui and Jesper Honig Spring and Jan Vitek",
title = "Flexible task graphs: a unified restricted thread
programming model for {Java}",
journal = j-SIGPLAN,
volume = "43",
number = "7",
pages = "1--11",
month = jul,
year = "2008",
DOI = "https://doi.org/10.1145/1375657.1375659",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 11:05:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The disadvantages of unconstrained shared-memory
multi-threading in Java, especially with regard to
latency and determinism in realtime systems, have given
rise to a variety of language extensions that place
restrictions on how threads allocate, share, and
communicate memory, leading to order-of-magnitude
reductions in latency and jitter. However, each model
makes different trade-offs with respect to
expressiveness, efficiency, enforcement, and latency,
and no one model is best for all applications.\par
In this paper we present Flexible Task Graphs
(Flexotasks), a single system that allows different
isolation policies and mechanisms to be combined in an
orthogonal manner, subsuming four previously proposed
models as well as making it possible to use new
combinations best suited to the needs of particular
applications. We evaluate our implementation on top of
the IBM Web-Sphere Real Time Java virtual machine using
both a microbenchmark and a 30 KLOC avionics collision
detector. We show that Flexotasks are capable of
executing periodic threads at 10 KHz with a standard
deviation of 1.2$ \mu $ s and that it achieves
significantly better performance than RTSJ's scoped
memory constructs while remaining impervious to
interference from global garbage collection.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "Java Virtual Machine; memory management; ownership
types; real-time systems",
author = "Helge Bahmann and Konrad Froitzheim",
title = "Extending futex for kernel to user notification",
journal = j-OPER-SYS-REV,
volume = "42",
number = "5",
pages = "18--26",
month = jul,
year = "2008",
DOI = "https://doi.org/10.1145/1400097.1400100",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Wed Aug 6 16:54:12 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Threads in reactive applications need to service a
multitude of events from different sources such as
device drivers, communication channels or cooperating
threads. While notification about these events can
conceptually be understood as a form of
'synchronization', most operating systems (including
Linux) do not provide a unified abstraction. This paper
proposes to separate event delivery and notification,
and to provide unified event notification through
general-purpose synchronization objects. It
demonstrates how this unified mechanism can be
implemented in Linux as an extension of the futex
mechanism to allow notification from kernel-space.
Required modifications are discussed and their impact
is assessed. The new event notification mechanism
allows to move many thread activation policy decisions
into user-space, with benefits for multi-threaded
reactive applications: This is demonstrated in a
modification of the leader/followers pattern with
considerable performance benefits.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
keywords = "event notification; followers; futex; leader {\&}
followers",
author = "Hans-J. Boehm and Sarita V. Adve",
title = "Foundations of the {C++} concurrency memory model",
journal = j-SIGPLAN,
volume = "43",
number = "6",
pages = "68--78",
month = jun,
year = "2008",
DOI = "https://doi.org/10.1145/1379022.1375591",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 11:04:53 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Currently multi-threaded C or C++ programs combine a
single-threaded programming language with a separate
threads library. This is not entirely sound [7].\par
We describe an effort, currently nearing completion, to
address these issues by explicitly providing semantics
for threads in the next revision of the C++ standard.
Our approach is similar to that recently followed by
Java [25], in that, at least for a well-defined and
interesting subset of the language, we give
sequentially consistent semantics to programs that do
not contain data races. Nonetheless, a number of our
decisions are often surprising even to those familiar
with the Java effort:\par
We (mostly) insist on sequential consistency for
race-free programs, in spite of implementation issues
that came to light after the Java work.\par
We give no semantics to programs with data races. There
are no benign C++ data races.\par
We use weaker semantics for trylock than existing
languages or libraries, allowing us to promise
sequential consistency with an intuitive race
definition, even for programs with trylock.\par
This paper describes the simple model we would like to
be able to provide for C++ threads programmers, and
explain how this, together with some practical, but
often under-appreciated implementation constraints,
drives us towards the above decisions.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "c++; data race; memory consistency; memory model;
sequential consistency; trylock",
author = "Carlos Boneti and Francisco J. Cazorla and Roberto
Gioiosa and Alper Buyuktosunoglu and Chen-Yong Cher and
Mateo Valero",
title = "Software-Controlled Priority Characterization of
{POWER5} Processor",
journal = j-COMP-ARCH-NEWS,
volume = "36",
number = "3",
pages = "415--426",
month = jun,
year = "2008",
DOI = "https://doi.org/10.1109/ISCA.2008.8",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Wed Aug 6 08:35:03 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Due to the limitations of instruction-level
parallelism, thread-level parallelism has become a
popular way to improve processor performance. One
example is the IBM POWER5TM processor, a two-context
simultaneous-multithreaded dual-core chip. In each SMT
core, the IBM POWER5 features two levels of thread
resource balancing and prioritization. The first level
provides automatic in-hardware resource balancing,
while the second level is a software-controlled
priority mechanism that presents eight levels of thread
priorities. Currently, software-controlled
prioritization is only used in limited number of cases
in the software platforms due to lack of performance
characterization of the effects of this mechanism. In
this work, we characterize the effects of the
software-based prioritization on several different
workloads. We show that the impact of the
prioritization significantly depends on the workloads
coscheduled on a core. By prioritizing the right task,
it is possible to obtain more than two times of
throughput improvement for synthetic workloads compared
to the baseline. We also present two application case
studies targeting two different performance metrics:
the first case study improves overall throughput by
23.7\% and the second case study reduces the total
execution time by 9.3\%. In addition, we show the
circumstances when a background thread can be run
transparently without affecting the performance of the
foreground thread.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "IBM POWER5; performance characterization; simultaneous
multithreading; SMT; software-controlled
prioritization",
author = "Simone Campanoni and Giovanni Agosta and Stefano
Crespi Reghizzi",
title = "A parallel dynamic compiler for {CIL} bytecode",
journal = j-SIGPLAN,
volume = "43",
number = "4",
pages = "11--20",
month = apr,
year = "2008",
DOI = "https://doi.org/10.1145/1374752.1374754",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 11:04:46 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multi-core technology is being employed in most recent
high-performance architectures. Such architectures need
specifically designed multi-threaded software to
exploit all the potentialities of their hardware
parallelism.\par
At the same time, object code virtualization
technologies are achieving a growing popularity, as
they allow higher levels of software portability and
reuse.\par
Thus, a virtual execution environment running on a
multi-core processor has to run complex, high-level
applications and to exploit as much as possible the
underlying parallel hardware. We propose an approach
that leverages on CMP features to expose a novel
pipeline synchronization model for the internal threads
of the dynamic compiler.\par
Thanks to compilation latency masking effect of the
pipeline organization, our dynamic compiler, ILDJIT, is
able to achieve significant speedups (26\% on average)
with respect to the baseline, when the underlying
hardware exposes at least two cores.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "dynamic compilation; parallel virtual machine; virtual
execution system",
author = "Bumyong Choi and Leo Porter and Dean M. Tullsen",
title = "Accurate branch prediction for short threads",
journal = j-OPER-SYS-REV,
volume = "42",
number = "2",
pages = "125--134",
month = mar,
year = "2008",
DOI = "https://doi.org/10.1145/1353534.1346298",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Fri Jun 20 17:20:12 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multi-core processors, with low communication costs
and high availability of execution cores, will increase
the use of execution and compilation models that use
short threads to expose parallelism. Current branch
predictors seek to incorporate large amounts of control
flow history to maximize accuracy. However, when that
history is absent the predictor fails to work as
intended. Thus, modern predictors are almost useless
for threads below a certain length.\par
Using a Speculative Multithreaded (SpMT) architecture
as an example of a system which generates shorter
threads, this work examines techniques to improve
branch prediction accuracy when a new thread begins to
execute on a different core. This paper proposes a
minor change to the branch predictor that gives
virtually the same performance on short threads as an
idealized predictor that incorporates unknowable
pre-history of a spawned speculative thread. At the
same time, strong performance on long threads is
preserved. The proposed technique sets the global
history register of the spawned thread to the initial
value of the program counter. This novel and simple
design reduces branch mispredicts by 29\% and provides
as much as a 13\% IPC improvement on selected SPEC2000
benchmarks.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
keywords = "branch prediction; chip multiprocessors",
author = "Ravi Chugh and Jan W. Voung and Ranjit Jhala and Sorin
Lerner",
title = "Dataflow analysis for concurrent programs using
datarace detection",
journal = j-SIGPLAN,
volume = "43",
number = "6",
pages = "316--326",
month = jun,
year = "2008",
DOI = "https://doi.org/10.1145/1375581.1375620",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 11:04:53 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Dataflow analyses for concurrent programs differ from
their single-threaded counterparts in that they must
account for shared memory locations being overwritten
by concurrent threads. Existing dataflow analysis
techniques for concurrent programs typically fall at
either end of a spectrum: at one end, the analysis
conservatively kills facts about all data that might
possibly be shared by multiple threads; at the other
end, a precise thread-interleaving analysis determines
which data may be shared, and thus which dataflow facts
must be invalidated. The former approach can suffer
from imprecision, whereas the latter does not
scale.\par
We present RADAR, a framework that automatically
converts a dataflow analysis for sequential programs
into one that is correct for concurrent programs. RADAR
uses a race detection engine to kill the dataflow
facts, generated and propagated by the sequential
analysis, that become invalid due to concurrent writes.
Our approach of factoring all reasoning about
concurrency into a race detection engine yields two
benefits. First, to obtain analyses for code using new
concurrency constructs, one need only design a suitable
race detection engine for the constructs. Second, it
gives analysis designers an easy way to tune the
scalability and precision of the overall analysis by
only modifying the race detection engine. We describe
the RADAR framework and its implementation using a
pre-existing race detection engine. We show how RADAR
was used to generate a concurrent version of a
null-pointer dereference analysis, and we analyze the
result of running the generated concurrent analysis on
several benchmarks.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "interprocedural analysis; locksets; multithreaded
programs; summaries",
author = "Matthew Curtis-Maury and Filip Blagojevic and Christos
D. Antonopoulos and Dimitrios S. Nikolopoulos",
title = "Prediction-Based Power-Performance Adaptation of
Multithreaded Scientific Codes",
volume = "19",
number = "10",
pages = "1396--1410",
month = oct,
year = "2008",
DOI = "https://doi.org/10.1109/TPDS.2007.70804",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu May 13 12:06:56 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
Systems",
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Alan D. Fekete",
title = "Teaching students to develop thread-safe {Java}
classes",
journal = j-SIGCSE,
volume = "40",
number = "3",
pages = "119--123",
month = sep,
year = "2008",
DOI = "https://doi.org/10.1145/1597849.1384304",
ISSN = "0097-8418 (print), 2331-3927 (electronic)",
ISSN-L = "0097-8418",
bibdate = "Sat Nov 17 15:44:14 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/csharp.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Proceedings of ITiCSE '08.",
abstract = "Concurrent programming was once the preserve of
experts writing systems internals; but recently the
growing importance of application servers, and the
excellent support in Java and C\# for thread handling,
has brought threads and locking as topics that every
software developer might experience, and therefore
every computer science graduate ought to know. In this
paper we report on several years of experience teaching
this material in the early years of the curriculum. We
focus on one aspect of multi-threaded code, namely how
to write sensible thread-safe classes. We identify the
learning outcomes we aim to deliver, and we discuss the
main pedagogic difficulties students find. We present
some examples that can help students avoid common
erroneous views.",
acknowledgement = ack-nhfb,
fjournal = "SIGCSE Bulletin (ACM Special Interest Group on
Computer Science Education)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688",
author = "S. Fide and S. Jenks",
title = "Proactive Use of Shared {L3} Caches to Enhance Cache
Communications in Multi-Core Processors",
volume = "7",
number = "2",
pages = "57--60",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The software and hardware techniques to exploit the
potential of multi-core processors are falling behind,
even though the number of cores and cache levels per
chip is increasing rapidly. There is no explicit
communications support available, and hence inter-core
communications depend on cache coherence protocols,
resulting in demand-based cache line transfers with
their inherent latency and overhead. In this paper, we
present software controlled eviction (SCE) to improve
the performance of multithreaded applications running
on multi-core processors by moving shared data to
shared cache levels before it is demanded from remote
private caches. Simulation results show that SCE offers
significant performance improvement (8--28\%) and
reduces L3 cache misses by 88--98\%.",
acknowledgement = ack-nhfb,
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache coherence protocol; cache communication; cache
storage; Concurrent computing; Control systems;
Degradation; Delay; demand-based cache line transfer;
Hardware; intercore communications; microprocessor
chips; Multi-core/single-chip multiprocessors;
multi-threading; Multicore processing; multicore
processors; multithreaded application; Parallel
processing; Protocols; shared L3 cache; shared memory
systems; software controlled eviction; Software
performance; Support for multi-threaded execution",
author = "Cormac Flanagan and Stephen N. Freund",
title = "{Atomizer}: a dynamic atomicity checker for
multithreaded programs",
volume = "71",
number = "2",
pages = "89--109",
day = "1",
month = apr,
year = "2008",
ISSN = "0167-6423 (print), 1872-7964 (electronic)",
ISSN-L = "0167-6423",
bibdate = "Fri Apr 1 18:39:19 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Science of Computer Programming",
journal-URL = "http://www.sciencedirect.com/science/journal/01676423",
author = "Cormac Flanagan and Stephen N. Freund and Marina
Lifshin and Shaz Qadeer",
title = "Types for atomicity: {Static} checking and inference
for {Java}",
journal = j-TOPLAS,
volume = "30",
number = "4",
pages = "20:1--20:52",
month = jul,
year = "2008",
DOI = "https://doi.org/10.1145/1377492.1377495",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Tue Aug 5 19:14:53 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Atomicity is a fundamental correctness property in
multithreaded programs. A method is atomic if, for
every execution, there is an equivalent serial
execution in which the actions of the method are not
interleaved with actions of other threads. Atomic
methods are amenable to sequential reasoning, which
significantly facilitates subsequent analysis and
verification.\par
This article presents a type system for specifying and
verifying the atomicity of methods in multithreaded
Java programs using a synthesis of Lipton's theory of
reduction and type systems for race detection. The type
system supports guarded, write-guarded, and unguarded
fields, as well as thread-local data, parameterized
classes and methods, and protected locks. We also
present an algorithm for verifying atomicity via type
inference.\par
We have applied our type checker and type inference
tools to a number of commonly used Java library classes
and programs. These tools were able to verify the vast
majority of methods in these benchmarks as atomic,
indicating that atomicity is a widespread methodology
for multithreaded programming. In addition, reported
atomicity violations revealed some subtle errors in the
synchronization disciplines of these programs.",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM Transactions on Programming Languages and
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
keywords = "Atomicity; concurrent programs; type inference; type
systems",
author = "Cormac Flanagan and Stephen N. Freund and Jaeheon Yi",
title = "{Velodrome}: a sound and complete dynamic atomicity
checker for multithreaded programs",
journal = j-SIGPLAN,
volume = "43",
number = "6",
pages = "293--303",
month = jun,
year = "2008",
DOI = "https://doi.org/10.1145/1375581.1375618",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 11:04:53 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Atomicity is a fundamental correctness property in
multithreaded programs, both because atomic code blocks
are amenable to sequential reasoning (which
significantly simplifies correctness arguments), and
because atomicity violations often reveal defects in a
program's synchronization structure. Unfortunately, all
atomicity analyses developed to date are incomplete in
that they may yield false alarms on correctly
synchronized programs, which limits their
usefulness.\par
We present the first dynamic analysis for atomicity
that is both sound and complete. The analysis reasons
about the exact dependencies between operations in the
observed trace of the target program, and it reports
error messages if and only if the observed trace is not
conflict-serializable. Despite this significant
increase in precision, the performance and coverage of
our analysis is competitive with earlier incomplete
dynamic analyses for atomicity.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "atomicity; dynamic analysis; serializability",
author = "Anders Gidenstam and Marina Papatriantafilou",
title = "{LFTHREADS}: a lock-free thread library",
journal = j-COMP-ARCH-NEWS,
volume = "36",
number = "5",
pages = "88--92",
month = dec,
year = "2008",
DOI = "https://doi.org/10.1145/1556444.1556456",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri Jun 26 11:50:56 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This extended abstract presents LFTHREADS, a thread
library entirely based on lock-free methods, i.e. no
spinlocks or similar synchronization mechanisms are
employed in the implementation of the multithreading.
Since lockfreedom is highly desirable in
multiprocessors/multicores due to its advantages in
parallelism, fault-tolerance, convoy-avoidance and
more, there is an increased demand in lock-free methods
in parallel applications, hence also in
multiprocessor/multicore system services. LFTHREADS is
the first thread library that provides a lock-free
implementation of blocking synchronization primitives
for application threads; although the latter may sound
like a contradicting goal, such objects have several
benefits: e.g. library operations that block and
unblock threads on the same synchronization object can
make progress in parallel while maintaining the desired
thread-level semantics and without having to wait for
any 'slow' operations among them. Besides, as no
spin-locks or similar synchronization mechanisms are
employed, memory contention can be reduced and
processors/cores are able to do useful work. As a
consequence, applications, too, can enjoy enhanced
parallelism and fault-tolerance. For the
synchronization in LFTHREADS we have introduced a new
method, which we call responsibility hand-off (RHO),
that does not need any special kernel support. The RHO
method is also of independent interest, as it can also
serve as a tool for lock-free token passing, management
of contention and interaction between scheduling and
synchronization. This paper gives an outline and the
context of LFTHREADS. For more details the reader is
referred to [7] and [8].",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "George A. Gravvanis and Victor N. Epitropou",
title = "{Java} multithreading-based parallel approximate
arrow-type inverses",
journal = j-CCPE,
volume = "20",
number = "10",
pages = "1151--1172",
month = jul,
year = "2008",
DOI = "https://doi.org/10.1002/cpe.1262",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Mon Dec 5 10:08:25 MST 2011",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "18 Sep 2007",
author = "Wessam M. Hassanein and Layali K. Rashid and Moustafa
A. Hammad",
title = "Analyzing the Effects of Hyperthreading on the
Performance of Data Management Systems",
journal = j-INT-J-PARALLEL-PROG,
volume = "36",
number = "2",
pages = "206--225",
month = apr,
year = "2008",
DOI = "https://doi.org/10.1007/s10766-007-0066-x",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Wed Jul 9 16:07:03 MDT 2008",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=2;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=2&spage=206",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
keywords = "Data management systems; Databases; Hyper-threaded
architectures; Performance; Simultaneous
multithreading",
author = "Bingsheng He and Qiong Luo",
title = "Cache-oblivious databases: {Limitations} and
opportunities",
journal = j-TODS,
volume = "33",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2008",
DOI = "https://doi.org/10.1145/1366102.1366105",
ISSN = "0362-5915 (print), 1557-4644 (electronic)",
ISSN-L = "0362-5915",
bibdate = "Wed Jun 25 08:39:17 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tods/;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Cache-oblivious techniques, proposed in the theory
community, have optimal asymptotic bounds on the amount
of data transferred between any two adjacent levels of
an arbitrary memory hierarchy. Moreover, this optimal
performance is achieved without any hardware platform
specific tuning. These properties are highly attractive
to autonomous databases, especially because the
hardware architectures are becoming increasingly
complex and diverse.\par
In this article, we present our design, implementation,
and evaluation of the first cache-oblivious in-memory
query processor, EaseDB. Moreover, we discuss the
inherent limitations of the cache-oblivious approach as
well as the opportunities given by the upcoming
hardware architectures. Specifically, a cache-oblivious
technique usually requires sophisticated algorithm
design to achieve a comparable performance to its
cache-conscious counterpart. Nevertheless, this
development-time effort is compensated by the
automaticity of performance achievement and the reduced
ownership cost. Furthermore, this automaticity enables
cache-oblivious techniques to outperform their
cache-conscious counterparts in multi-threading
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Database Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J777",
keywords = "cache-conscious; cache-oblivious; chip
multiprocessors; data caches; simultaneous
multithreading",
author = "Bart Jacobs and Frank Piessens and Jan Smans and K.
Rustan M. Leino and Wolfram Schulte",
title = "A programming model for concurrent object-oriented
programs",
journal = j-TOPLAS,
volume = "31",
number = "1",
pages = "1:1--1:48",
month = dec,
year = "2008",
DOI = "https://doi.org/10.1145/1452044.1452045",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Tue Dec 23 11:52:52 MST 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Reasoning about multithreaded object-oriented programs
is difficult, due to the nonlocal nature of object
aliasing and data races. We propose a programming
regime (or {\em programming model\/}) that rules out
data races, and enables local reasoning in the presence
of object aliasing and concurrency. Our programming
model builds on the multithreading and synchronization
primitives as they are present in current mainstream
programming languages. Java or C\# programs developed
according to our model can be annotated by means of
stylized comments to make the use of the model
explicit. We show that such annotated programs can be
formally verified to comply with the programming model.
If the annotated program verifies, the underlying Java
or C\# program is guaranteed to be free from data
races, and it is sound to reason locally about program
behavior. Verification is modular: a program is valid
if all methods are valid, and validity of a method does
not depend on program elements that are not visible to
the method. We have implemented a verifier for programs
developed according to our model in a custom build of
the Spec\# programming system, and we have validated
our approach on a case study.",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Programming Languages and
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
keywords = "Aliasing; data races; local reasoning; modular
reasoning; ownership; verification condition
generation",
author = "Pascal Jaisson and Florian {De Vuyst}",
title = "An innovating {PDE} model based on fluid flow paradigm
for multithread systems",
volume = "52",
number = "18",
pages = "3318--3324",
day = "22",
month = dec,
year = "2008",
CODEN = "????",
ISSN = "1389-1286 (print), 1872-7069 (electronic)",
ISSN-L = "1389-1286",
bibdate = "Sat Apr 2 08:42:29 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Computer Networks (Amsterdam, Netherlands: 1999)",
journal-URL = "http://www.sciencedirect.com/science/journal/13891286",
author = "Dongsoo Kang and Chen Liu and Jean-Luc Gaudiot",
title = "The Impact of Speculative Execution on {SMT}
Processors",
journal = j-INT-J-PARALLEL-PROG,
volume = "36",
number = "4",
pages = "361--385",
month = aug,
year = "2008",
DOI = "https://doi.org/10.1007/s10766-007-0052-3",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Wed Jul 9 16:07:14 MDT 2008",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=4;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=4&spage=361",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
keywords = "Confidence estimator; Simultaneous multithreading;
Speculation control; Thread scheduling",
author = "Taeho Kgil and Ali Saidi and Nathan Binkert and Steve
Reinhardt and Krisztian Flautner and Trevor Mudge",
title = "{PicoServer}: {Using} {$3$D} stacking technology to
build energy efficient servers",
journal = j-JETC,
volume = "4",
number = "4",
pages = "16:1--16:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1412587.1412589",
ISSN = "1550-4832",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:22:55 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/jetc/;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This article extends our prior work to show that a
straightforward use of 3D stacking technology enables
the design of compact energy-efficient servers. Our
proposed architecture, called PicoServer, employs 3D
technology to bond one die containing several simple,
slow processing cores to multiple memory dies
sufficient for a primary memory. The multiple memory
dies are composed of DRAM. This use of 3D stacks
readily facilitates wide low-latency buses between
processors and memory. These remove the need for an L2
cache allowing its area to be re-allocated to
additional simple cores. The additional cores allow the
clock frequency to be lowered without impairing
throughput. Lower clock frequency means that thermal
constraints, a concern with 3D stacking, are easily
satisfied. We extend our original analysis on
PicoServer to include: (1) a wider set of server
workloads, (2) the impact of multithreading, and (3)
the on-chip DRAM architecture and system memory usage.
PicoServer is intentionally simple, requiring only the
simplest form of 3D technology where die are stacked on
top of one another. Our intent is to minimize risk of
introducing a new technology (3D) to implement a class
of low-cost, low-power compact server architectures.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "3D stacking technology; chip multiprocessor;
full-system simulation; Low power; Tier-1/2/3 server",
author = "Ronny Krashinsky and Christopher Batten and Krste
Asanovi{\'c}",
title = "Implementing the {Scale} vector-thread processor",
journal = j-TODAES,
volume = "13",
number = "3",
pages = "41:1--41:??",
month = jul,
year = "2008",
DOI = "https://doi.org/10.1145/1367045.1367050",
ISSN = "1084-4309 (print), 1557-7309 (electronic)",
ISSN-L = "1084-4309",
bibdate = "Tue Aug 5 18:41:27 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/todaes/;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The Scale vector-thread processor is a
complexity-effective solution for embedded computing
which flexibly supports both vector and highly
multithreaded processing. The 7.1-million transistor
chip has 16 decoupled execution clusters, vector load
and store units, and a nonblocking 32KB cache. An
automated and iterative design and verification flow
enabled a performance-, power-, and area-efficient
implementation with two person-years of development
effort. Scale has a core area of 16.6 mm$^2$ in 180 nm
technology, and it consumes 400 mW--1.1 W while running
at 260 MHz.",
acknowledgement = ack-nhfb,
articleno = "41",
fjournal = "ACM Transactions on Design Automation of Electronic
Systems (TODAES)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776",
keywords = "hybrid C++/Verilog simulation; iterative VLSI design
flow; multithreaded processors; procedural datapath
pre-placement; vector processors; vector-thread
processors",
author = "Sanjeev Kumar and Daehyun Kim and Mikhail Smelyanskiy
and Yen-Kuang Chen and Jatin Chhugani and Christopher
J. Hughes and Changkyu Kim and Victor W. Lee and
Anthony D. Nguyen",
title = "Atomic Vector Operations on Chip Multiprocessors",
journal = j-COMP-ARCH-NEWS,
volume = "36",
number = "3",
pages = "441--452",
month = jun,
year = "2008",
DOI = "https://doi.org/10.1145/1394608.1382154",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Wed Aug 6 08:35:03 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The current trend is for processors to deliver
dramatic improvements in parallel performance while
only modestly improving serial performance. Parallel
performance is harvested through vector/SIMD
instructions as well as multithreading (through both
multithreaded cores and chip multiprocessors). Vector
parallelism can be more efficiently supported than
multithreading, but is often harder for software to
exploit. In particular, code with sparse data access
patterns cannot easily utilize the vector/SIMD
instructions of mainstream processors. Hardware to
scatter and gather sparse data has previously been
proposed to enable vector execution for these codes.
However, on multithreaded architectures, a number of
applications spend significant time on atomic
operations (e.g., parallel reductions), which cannot be
vectorized using previously proposed schemes. This
paper proposes architectural support for atomic vector
operations (referred to as GLSC) that addresses this
limitation. GLSC extends scatter-gather hardware to
support atomic memory operations. Our experiments show
that the GLSC provides an average performance
improvement on a set of important RMS kernels of 54\%
for 4-wide SIMD.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "locks; multiprocessors; reductions; SIMD; vector",
author = "Z. Li and C. Zhu and L. Shang and R. Dick and Y. Sun",
title = "Transaction-Aware Network-on-Chip Resource
Reservation",
volume = "7",
number = "2",
pages = "53--56",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Performance and scalability are critically-important
for on-chip interconnect in many-core
chip-multiprocessor systems. Packet-switched
interconnect fabric, widely viewed as the de facto
on-chip data communication backplane in the many-core
era, offers high throughput and excellent scalability.
However, these benefits come at the price of router
latency due to run-time multi-hop data buffering and
resource arbitration. The network accounts for a
majority of on-chip data transaction latency. In this
work, we propose dynamic in-network resource
reservation techniques to optimize run-time on-chip
data transactions. This idea is motivated by the need
to preserve existing abstraction and general-purpose
network performance while optimizing for
frequently-occurring network events such as data
transactions. Experimental studies using multithreaded
benchmarks demonstrate that the proposed techniques can
reduce on-chip data access latency by 28.4\% on average
in a 16-node system and 29.2\% on average in a 36-node
system.",
acknowledgement = ack-nhfb,
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Backplanes; buffer storage; Computer buffers; data
communication; Data communication; de facto on-chip
data communication backplane; Delay; dynamic in-network
resource reservation techniques; Fabrics;
frequently-occurring network events; Interconnection
architectures; Interconnections (Subsystems); many-core
chip-multiprocessor systems; multiprocessor
interconnection networks; Network-on-a-chip; on-chip
data transaction latency; On-chip interconnection
networks; packet switching; packet-switched
interconnect fabric; Parallel Architectures; resource
allocation; router latency; run-time multihop data
buffering; Runtime; Scalability; System-on-a-chip;
telecommunication network routing; Throughput;
transaction-aware network-on-chip resource
reservation",
author = "Duo Liu and Zheng Chen and Bei Hua and Nenghai Yu and
Xinan Tang",
title = "High-performance packet classification algorithm for
multithreaded {IXP} network processor",
journal = j-TECS,
volume = "7",
number = "2",
pages = "16:1--16:??",
month = feb,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1331331.1331340",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Jun 12 15:22:00 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Packet classification is crucial for the Internet to
provide more value-added services and guaranteed
quality of service. Besides hardware-based solutions,
many software-based classification algorithms have been
proposed. However, classifying at 10 Gbps speed or
higher is a challenging problem and it is still one of
the performance bottlenecks in core routers. In
general, classification algorithms face the same
challenge of balancing between high classification
speed and low memory requirements. This paper proposes
a modified recursive flow classification (RFC)
algorithm, Bitmap-RFC, which significantly reduces the
memory requirements of RFC by applying a bitmap
compression technique. To speed up classifying speed,
we exploit the multithreaded architectural features in
various algorithm development stages from algorithm
design to algorithm implementation. As a result,
Bitmap-RFC strikes a good balance between speed and
space. It can significantly keep both high
classification speed and reduce memory space
consumption. This paper investigates the main NPU
software design aspects that have dramatic performance
impacts on any NPU-based implementations: memory space
reduction, instruction selection, data allocation, task
partitioning, and latency hiding. We experiment with an
architecture-aware design principle to guarantee the
high performance of the classification algorithm on an
NPU implementation. The experimental results show that
the Bitmap-RFC algorithm achieves 10 Gbps speed or
higher and has a good scalability on Intel IXP2800
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840",
keywords = "architecture; embedded system design; multithreading;
network processor; packet classification; thread-level
parallelism",
author = "Carlos Madriles and Carlos Garc{\'\i}a-Qui{\~n}ones
and Jes{\'u}s S{\'a}nchez and Pedro Marcuello and
Antonio Gonz{\'a}lez and Dean M. Tullsen and Hong Wang
and John P. Shen",
title = "{Mitosis}: a Speculative Multithreaded Processor Based
on Precomputation Slices",
volume = "19",
number = "7",
pages = "914--925",
month = jul,
year = "2008",
DOI = "https://doi.org/10.1109/TPDS.2007.70797",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu Jul 3 12:41:00 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
Systems",
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Pablo Montesinos and Luis Ceze and Josep Torrellas",
title = "{DeLorean}: Recording and Deterministically Replaying
Shared-Memory Multiprocessor Execution Efficiently",
journal = j-COMP-ARCH-NEWS,
volume = "36",
number = "3",
pages = "289--300",
month = jun,
year = "2008",
DOI = "https://doi.org/10.1109/ISCA.2008.36",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Wed Aug 6 08:35:03 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Support for deterministic replay of multithreaded
execution can greatly help in finding concurrency bugs.
For highest effectiveness, replay schemes should (i)
record at production-run speed, (ii) keep their logging
requirements minute, and (iii) replay at a speed
similar to that of the initial execution. In this
paper, we propose a new substrate for deterministic
replay that provides substantial advances along these
axes. In our proposal, processors execute blocks of
instructions atomically, as in transactional memory or
speculative multithreading, and the system only needs
to record the commit order of these blocks. We call our
scheme DeLorean. Our results show that DeLorean records
execution at a speed similar to that of Release
Consistency (RC) execution and replays at about 82\% of
its speed. In contrast, most current schemes only
record at the speed of Sequential Consistency (SC)
execution. Moreover, DeLorean only needs 7.5\% of the
log size needed by a state-of-the-art scheme. Finally,
DeLorean can be configured to need only 0.6\% of the
log size of the state-of-the-art scheme at the cost of
recording at 86\% of RC's execution speed --- still
faster than SC. In this configuration, the log of an
8-processor 5-GHz machine is estimated to be only about
20GB per day.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Madanlal Musuvathi and Shaz Qadeer",
title = "Fair stateless model checking",
journal = j-SIGPLAN,
volume = "43",
number = "6",
pages = "362--371",
month = jun,
year = "2008",
DOI = "https://doi.org/10.1145/1379022.1375625",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 11:04:53 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Stateless model checking is a useful state-space
exploration technique for systematically testing
complex real-world software. Existing stateless model
checkers are limited to the verification of safety
properties on terminating programs. However, realistic
concurrent programs are nonterminating, a property that
significantly reduces the efficacy of stateless model
checking in testing them. Moreover, existing stateless
model checkers are unable to verify that a
nonterminating program satisfies the important liveness
property of livelock-freedom, a property that requires
the program to make continuous progress for any
To address these shortcomings, this paper argues for
incorporating a fair scheduler in stateless
exploration. The key contribution of this paper is an
explicit scheduler that is (strongly) fair and at the
same time sufficiently nondeterministic to guarantee
full coverage of safety properties. We have implemented
the fair scheduler in the CHESS model checker. We show
through theoretical arguments and empirical evaluation
that our algorithm satisfies two important properties:
(1) it visits all states of a finite-state program
achieving state coverage at a faster rate than existing
techniques, and (2) it finds all livelocks in a
finite-state program. Before this work, nonterminating
programs had to be manually modified in order to apply
CHESS to them. The addition of fairness has allowed
CHESS to be effectively applied to real-world
nonterminating programs without any modification. For
example, we have successfully booted the Singularity
operating system under the control of CHESS.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrency; fairness; liveness; model checking;
multi-threading; shared-memory programs; software
testing",
author = "Iulian Neamtiu and Michael Hicks and Jeffrey S. Foster
and Polyvios Pratikakis",
title = "Contextual effects for version-consistent dynamic
software updating all and safe concurrent programming",
journal = j-SIGPLAN,
volume = "43",
number = "1",
pages = "37--49",
month = jan,
year = "2008",
DOI = "https://doi.org/10.1145/1328897.1328447",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 11:02:13 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper presents a generalization of standard
effect systems that we call contextual effects. A
traditional effect system computes the effect of an
expression $e$. Our system additionally computes the
effects of the computational context in which $e$
occurs. More specifically, we compute the effect of the
computation that has already occurred (the prior effect)
and the effect of the computation yet to take place
(the future effect).\par
Contextual effects are useful when the past or future
computation of the program is relevant at various
program points. We present two substantial examples.
First, we show how prior and future effects can be used
to enforce transactional version consistency (TVC), a
novel correctness property for dynamic software
updates. TVC ensures that programmer-designated
transactional code blocks appear to execute entirely at
the same code version, even if a dynamic update occurs
in the middle of the block. Second, we show how future
effects can be used in the analysis of multi-threaded
programs to find thread-shared locations. This is an
essential step in applications such as data race
detection.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "computation effects; contextual effects; data race
detection; dynamic software updating; type and effect
systems; version consistency",
author = "Guilherme Ottoni and David I. August",
title = "Communication optimizations for global multi-threaded
instruction scheduling",
journal = j-COMP-ARCH-NEWS,
volume = "36",
number = "1",
pages = "222--232",
month = mar,
year = "2008",
DOI = "https://doi.org/10.1145/1353535.1346310",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Jun 17 11:51:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The recent shift in the industry towards chip
multiprocessor (CMP) designs has brought the need for
multi-threaded applications to mainstream computing. As
observed in several limit studies, most of the
parallelization opportunities require looking for
parallelism beyond local regions of code. To exploit
these opportunities, especially for sequential
applications, researchers have recently proposed global
multi-threaded instruction scheduling techniques,
including DSWP and GREMIO. These techniques
simultaneously schedule instructions from large regions
of code, such as arbitrary loop nests or whole
procedures, and have been shown to be effective at
extracting threads for many applications. A key enabler
of these global instruction scheduling techniques is
the Multi-Threaded Code Generation (MTCG) algorithm
proposed in [16], which generates multi-threaded code
for any partition of the instructions into threads.
This algorithm inserts communication and
synchronization instructions in order to satisfy all
inter-thread dependences.\par
In this paper, we present a general compiler framework,
COCO, to optimize the communication and synchronization
instructions inserted by the MTCG algorithm. This
framework, based on thread-aware data-flow analyses and
graph min-cut algorithms, appropriately models and
optimizes all kinds of inter-thread dependences,
including register, memory, and control dependences.
Our experiments, using a fully automatic compiler
implementation of these techniques, demonstrate
significant reductions (about 30\% on average) in the
number of dynamic communication instructions in code
parallelized with DSWP and GREMIO. This reduction in
communication translates to performance gains of up to
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "communication; data-flow analysis; graph min-cut;
instruction scheduling; multi-threading;
author = "Guilherme Ottoni and David I. August",
title = "Communication optimizations for global multi-threaded
instruction scheduling",
journal = j-OPER-SYS-REV,
volume = "42",
number = "2",
pages = "222--232",
month = mar,
year = "2008",
DOI = "https://doi.org/10.1145/1353535.1346310",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Fri Jun 20 17:20:12 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The recent shift in the industry towards chip
multiprocessor (CMP) designs has brought the need for
multi-threaded applications to mainstream computing. As
observed in several limit studies, most of the
parallelization opportunities require looking for
parallelism beyond local regions of code. To exploit
these opportunities, especially for sequential
applications, researchers have recently proposed global
multi-threaded instruction scheduling techniques,
including DSWP and GREMIO. These techniques
simultaneously schedule instructions from large regions
of code, such as arbitrary loop nests or whole
procedures, and have been shown to be effective at
extracting threads for many applications. A key enabler
of these global instruction scheduling techniques is
the Multi-Threaded Code Generation (MTCG) algorithm
proposed in [16], which generates multi-threaded code
for any partition of the instructions into threads.
This algorithm inserts communication and
synchronization instructions in order to satisfy all
inter-thread dependences.\par
In this paper, we present a general compiler framework,
COCO, to optimize the communication and synchronization
instructions inserted by the MTCG algorithm. This
framework, based on thread-aware data-flow analyses and
graph min-cut algorithms, appropriately models and
optimizes all kinds of inter-thread dependences,
including register, memory, and control dependences.
Our experiments, using a fully automatic compiler
implementation of these techniques, demonstrate
significant reductions (about 30\% on average) in the
number of dynamic communication instructions in code
parallelized with DSWP and GREMIO. This reduction in
communication translates to performance gains of up to
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
keywords = "communication; data-flow analysis; graph min-cut;
instruction scheduling; multi-threading;
author = "Guilherme Ottoni and David I. August",
title = "Communication optimizations for global multi-threaded
instruction scheduling",
journal = j-SIGPLAN,
volume = "43",
number = "3",
pages = "222--232",
month = mar,
year = "2008",
DOI = "https://doi.org/10.1145/1353535.1346310",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 11:03:40 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The recent shift in the industry towards chip
multiprocessor (CMP) designs has brought the need for
multi-threaded applications to mainstream computing. As
observed in several limit studies, most of the
parallelization opportunities require looking for
parallelism beyond local regions of code. To exploit
these opportunities, especially for sequential
applications, researchers have recently proposed global
multi-threaded instruction scheduling techniques,
including DSWP and GREMIO. These techniques
simultaneously schedule instructions from large regions
of code, such as arbitrary loop nests or whole
procedures, and have been shown to be effective at
extracting threads for many applications. A key enabler
of these global instruction scheduling techniques is
the Multi-Threaded Code Generation (MTCG) algorithm
proposed in [16], which generates multi-threaded code
for any partition of the instructions into threads.
This algorithm inserts communication and
synchronization instructions in order to satisfy all
inter-thread dependences.\par
In this paper, we present a general compiler framework,
COCO, to optimize the communication and synchronization
instructions inserted by the MTCG algorithm. This
framework, based on thread-aware data-flow analyses and
graph min-cut algorithms, appropriately models and
optimizes all kinds of inter-thread dependences,
including register, memory, and control dependences.
Our experiments, using a fully automatic compiler
implementation of these techniques, demonstrate
significant reductions (about 30\% on average) in the
number of dynamic communication instructions in code
parallelized with DSWP and GREMIO. This reduction in
communication translates to performance gains of up to
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "communication; data-flow analysis; graph min-cut;
instruction scheduling; multi-threading;
author = "Ram Rangan and Neil Vachharajani and Guilherme Ottoni
and David I. August",
title = "Performance scalability of decoupled software pipelining",
journal = j-TACO,
volume = "5",
number = "2",
pages = "8:1--8:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1400112.1400113",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 28 13:25:00 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Any successful solution to using multicore processors
to scale general-purpose program performance will have
to contend with rising intercore communication costs
while exposing coarse-grained parallelism. Recently
proposed pipelined multithreading (PMT) techniques have
been demonstrated to have general-purpose applicability
and are also able to effectively tolerate inter-core
latencies through pipelined interthread communication.
These desirable properties make PMT techniques strong
candidates for program parallelization on current and
future multicore processors and understanding their
performance characteristics is critical to their
deployment. To that end, this paper evaluates the
performance scalability of a general-purpose PMT
technique called decoupled software pipelining (DSWP)
and presents a thorough analysis of the communication
bottlenecks that must be overcome for optimal DSWP
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
keywords = "decoupled software pipelining; performance analysis",
author = "Peter A. Rounce and Alberto F. De Souza",
title = "Dynamic Instruction Scheduling in a Trace-based
Multi-threaded Architecture",
journal = j-INT-J-PARALLEL-PROG,
volume = "36",
number = "2",
pages = "184--205",
month = apr,
year = "2008",
DOI = "https://doi.org/10.1007/s10766-007-0062-1",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Wed Jul 9 16:07:03 MDT 2008",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=2;
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=2&spage=184",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
keywords = "Dynamic instruction scheduling; Simultaneous
multi-threading; VLIW; Wide issue architectures",
author = "Yaoping Ruan and Vivek S. Pai and Erich Nahum and John
M. Tracey",
title = "Do commodity {SMT} processors need more {OS} research?",
journal = j-OPER-SYS-REV,
volume = "42",
number = "1",
pages = "21--25",
month = jan,
year = "2008",
DOI = "https://doi.org/10.1145/1341312.1341318",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Fri Jun 20 17:19:29 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "The availability of Simultaneous Multithreading (SMT)
in commodity processors such as the Pentium 4 (P4) has
raised interest among OS researchers. While earlier
simulation studies of SMT suggested exciting
performance potential, observed improvement on the P4
has been much more restrained, raising the hope that OS
research can help bridge the gap. We argue that OS
research for current commodity Simultaneous
Multithreading (SMT) processors is unlikely to yield
significant benefits. In general, we find that SMT
processor simulations were optimistic about cache and
memory performance characteristics, while overlooking
the OS overheads of SMT kernels versus uniprocessor
kernels. Using measurement and analysis on actual
hardware, we find that little opportunity exists for
realistic performance gains on commodity SMT beyond
what is currently achieved.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
author = "Kevin Schaffer and Robert A. Walker",
title = "Using Hardware Multithreading to Overcome
Broadcast\slash Reduction Latency in an Associative
{SIMD} Processor",
volume = "18",
number = "4",
pages = "491--509",
month = dec,
year = "2008",
DOI = "https://doi.org/10.1142/S0129626408003533",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
bibdate = "Thu Sep 2 09:08:11 MDT 2010",
bibsource = "http://ejournals.wspc.com.sg/ppl/;
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "Koushik Sen",
title = "Race directed random testing of concurrent programs",
journal = j-SIGPLAN,
volume = "43",
number = "6",
pages = "11--21",
month = jun,
year = "2008",
DOI = "https://doi.org/10.1145/1379022.1375584",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 11:04:53 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Bugs in multi-threaded programs often arise due to
data races. Numerous static and dynamic program
analysis techniques have been proposed to detect data
races. We propose a novel randomized dynamic analysis
technique that utilizes potential data race information
obtained from an existing analysis tool to separate
real races from false races without any need for manual
inspection. Specifically, we use potential data race
information obtained from an existing dynamic analysis
technique to control a random scheduler of threads so
that real race conditions get created with very high
probability and those races get resolved randomly at
runtime. Our approach has several advantages over
existing dynamic analysis tools. First, we can create a
real race condition and resolve the race randomly to
see if an error can occur due to the race. Second, we
can replay a race revealing execution efficiently by
simply using the same seed for random number
generation--we do not need to record the execution.
Third, our approach has very low overhead compared to
other precise dynamic race detection techniques because
we only track all synchronization operations and a
single pair of memory access statements that are
reported to be in a potential race by an existing
analysis. We have implemented the technique in a
prototype tool for Java and have experimented on a
number of large multi-threaded Java programs. We report
a number of previously known and unknown bugs and real
races in these Java programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrency; dynamic analysis; race detection; random
author = "Joseph J. Sharkey and Jason Loew and Dmitry V.
title = "Reducing register pressure in {SMT} processors through
{L2}-miss-driven early register release",
journal = j-TACO,
volume = "5",
number = "3",
pages = "13:1--13:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1455650.1455652",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 8 14:28:18 MST 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The register file is one of the most critical datapath
components limiting the number of threads that can be
supported on a simultaneous multithreading (SMT)
processor. To allow the use of smaller register files
without degrading performance, techniques that maximize
the efficiency of using registers through aggressive
register allocation/deallocation can be considered. In
this article, we propose a novel technique to early
deallocate physical registers allocated to threads
which experience L2 cache misses. This is accomplished
by speculatively committing the load-independent
instructions and deallocating the registers
corresponding to the previous mappings of their
destinations, without waiting for the cache miss
request to be serviced. The early deallocated registers
are then made immediately available for allocation to
instructions within the same thread as well as within
other threads, thus improving the overall processor
throughput. On the average across the simulated mixes
of multiprogrammed SPEC 2000 workloads, our technique
results in 33\% improvement in throughput and 25\%
improvement in terms of harmonic mean of weighted IPCs
over the baseline SMT with the state-of-the-art DCRA
policy. This is achieved without creating checkpoints,
maintaining per-register counters of pending consumers,
performing tag rebroadcasts, register remappings,
and/or additional associative searches.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
keywords = "register file; Simultaneous multithreading",
author = "M. Aater Suleman and Moinuddin K. Qureshi and Yale N.
title = "Feedback-driven threading: power-efficient and
high-performance execution of multi-threaded workloads
on {CMPs}",
journal = j-COMP-ARCH-NEWS,
volume = "36",
number = "1",
pages = "277--286",
month = mar,
year = "2008",
DOI = "https://doi.org/10.1145/1346281.1346317",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Jun 17 11:51:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Extracting high-performance from the emerging Chip
Multiprocessors (CMPs) requires that the application be
divided into multiple threads. Each thread executes on
a separate core thereby increasing concurrency and
improving performance. As the number of cores on a CMP
continues to increase, the performance of some
multi-threaded applications will benefit from the
increased number of threads, whereas, the performance
of other multi-threaded applications will become
limited by data-synchronization and off-chip bandwidth.
For applications that get limited by
data-synchronization, increasing the number of threads
significantly degrades performance and increases
on-chip power. Similarly, for applications that get
limited by off-chip bandwidth, increasing the number of
threads increases on-chip power without providing any
performance improvement. Furthermore, whether an
application gets limited by data-synchronization, or
bandwidth, or neither depends not only on the
application but also on the input set and the machine
configuration. Therefore, controlling the number of
threads based on the run-time behavior of the
application can significantly improve performance and
reduce power.\par
This paper proposes Feedback-Driven Threading (FDT), a
framework to dynamically control the number of threads
using run-time information. FDT can be used to
implement Synchronization-Aware Threading (SAT), which
predicts the optimal number of threads depending on the
amount of data-synchronization. Our evaluation shows
that SAT can reduce both execution time and power by up
to 66\% and 78\% respectively. Similarly, FDT can be
used to implement Bandwidth-Aware Threading (BAT),
which predicts the minimum number of threads required
to saturate the off-chip bus. Our evaluation shows that
BAT reduces on-chip power by up to 78\%. When SAT and
BAT are combined, the average execution time reduces by
17\% and power reduces by 59\%. The proposed techniques
leverage existing performance counters and require
minimal support from the threading library.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "bandwidth; CMP; multi-threaded; synchronization",
author = "M. Aater Suleman and Moinuddin K. Qureshi and Yale N.
title = "Feedback-driven threading: power-efficient and
high-performance execution of multi-threaded workloads
on {CMPs}",
journal = j-OPER-SYS-REV,
volume = "42",
number = "2",
pages = "277--286",
month = mar,
year = "2008",
DOI = "https://doi.org/10.1145/1346281.1346317",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Fri Jun 20 17:20:12 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Extracting high-performance from the emerging Chip
Multiprocessors (CMPs) requires that the application be
divided into multiple threads. Each thread executes on
a separate core thereby increasing concurrency and
improving performance. As the number of cores on a CMP
continues to increase, the performance of some
multi-threaded applications will benefit from the
increased number of threads, whereas, the performance
of other multi-threaded applications will become
limited by data-synchronization and off-chip bandwidth.
For applications that get limited by
data-synchronization, increasing the number of threads
significantly degrades performance and increases
on-chip power. Similarly, for applications that get
limited by off-chip bandwidth, increasing the number of
threads increases on-chip power without providing any
performance improvement. Furthermore, whether an
application gets limited by data-synchronization, or
bandwidth, or neither depends not only on the
application but also on the input set and the machine
configuration. Therefore, controlling the number of
threads based on the run-time behavior of the
application can significantly improve performance and
reduce power.\par
This paper proposes Feedback-Driven Threading (FDT), a
framework to dynamically control the number of threads
using run-time information. FDT can be used to
implement Synchronization-Aware Threading (SAT), which
predicts the optimal number of threads depending on the
amount of data-synchronization. Our evaluation shows
that SAT can reduce both execution time and power by up
to 66\% and 78\% respectively. Similarly, FDT can be
used to implement Bandwidth-Aware Threading (BAT),
which predicts the minimum number of threads required
to saturate the off-chip bus. Our evaluation shows that
BAT reduces on-chip power by up to 78\%. When SAT and
BAT are combined, the average execution time reduces by
17\% and power reduces by 59\%. The proposed techniques
leverage existing performance counters and require
minimal support from the threading library.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
keywords = "bandwidth; CMP; multi-threaded; synchronization",
author = "M. Aater Suleman and Moinuddin K. Qureshi and Yale N.
title = "Feedback-driven threading: power-efficient and
high-performance execution of multi-threaded workloads
on {CMPs}",
journal = j-SIGPLAN,
volume = "43",
number = "3",
pages = "277--286",
month = mar,
year = "2008",
DOI = "https://doi.org/10.1145/1346281.1346317",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 11:03:40 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Extracting high-performance from the emerging Chip
Multiprocessors (CMPs) requires that the application be
divided into multiple threads. Each thread executes on
a separate core thereby increasing concurrency and
improving performance. As the number of cores on a CMP
continues to increase, the performance of some
multi-threaded applications will benefit from the
increased number of threads, whereas, the performance
of other multi-threaded applications will become
limited by data-synchronization and off-chip bandwidth.
For applications that get limited by
data-synchronization, increasing the number of threads
significantly degrades performance and increases
on-chip power. Similarly, for applications that get
limited by off-chip bandwidth, increasing the number of
threads increases on-chip power without providing any
performance improvement. Furthermore, whether an
application gets limited by data-synchronization, or
bandwidth, or neither depends not only on the
application but also on the input set and the machine
configuration. Therefore, controlling the number of
threads based on the run-time behavior of the
application can significantly improve performance and
reduce power.\par
This paper proposes Feedback-Driven Threading (FDT), a
framework to dynamically control the number of threads
using run-time information. FDT can be used to
implement Synchronization-Aware Threading (SAT), which
predicts the optimal number of threads depending on the
amount of data-synchronization. Our evaluation shows
that SAT can reduce both execution time and power by up
to 66\% and 78\% respectively. Similarly, FDT can be
used to implement Bandwidth-Aware Threading (BAT),
which predicts the minimum number of threads required
to saturate the off-chip bus. Our evaluation shows that
BAT reduces on-chip power by up to 78\%. When SAT and
BAT are combined, the average execution time reduces by
17\% and power reduces by 59\%. The proposed techniques
leverage existing performance counters and require
minimal support from the threading library.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "bandwidth; CMP; multi-threaded; synchronization",
author = "Shyamkumar Thoziyoor and Jung Ho Ahn and Matteo
Monchiero and Jay B. Brockman and Norman P. Jouppi",
title = "A Comprehensive Memory Modeling Tool and Its
Application to the Design and Analysis of Future Memory Hierarchies",
journal = j-COMP-ARCH-NEWS,
volume = "36",
number = "3",
pages = "51--62",
month = jun,
year = "2008",
DOI = "https://doi.org/10.1109/ISCA.2008.16",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Wed Aug 6 08:35:03 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In this paper we introduce CACTI-D, a significant
enhancement of CACTI 5.0. CACTI-D adds support for
modeling of commodity DRAM technology and support for
main memory DRAM chip organization. CACTI-D enables
modeling of the complete memory hierarchy with
consistent models all the way from SRAM based L1 caches
through main memory DRAMs on DIMMs. We illustrate the
potential applicability of CACTI-D in the design and
analysis of future memory hierarchies by carrying out a
last level cache study for a multicore multithreaded
architecture at the 32nm technology node. In this study
we use CACTI-D to model all components of the memory
hierarchy including L1, L2, last level SRAM, logic
process based DRAM or commodity DRAM L3 caches, and
main memory DRAM chips. We carry out architectural
simulation using benchmarks with large data sets and
present results of their execution time, breakdown of
power in the memory hierarchy, and system energy-delay
product for the different system configurations. We
find that commodity DRAM technology is most attractive
for stacked last level caches, with significantly lower
energy-delay products.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "cache; CACTI; commodity DRAM; LLC; logic-process based
author = "Dana Vantrease and Robert Schreiber and Matteo
Monchiero and Moray McLaren and Norman P. Jouppi and
Marco Fiorentino and Al Davis and Nathan Binkert and
Raymond G. Beausoleil and Jung Ho Ahn",
title = "{Corona}: System Implications of Emerging Nanophotonic Technology",
journal = j-COMP-ARCH-NEWS,
volume = "36",
number = "3",
pages = "153--164",
month = jun,
year = "2008",
DOI = "https://doi.org/10.1145/1394608.1382135",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Wed Aug 6 08:35:03 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We expect that many-core microprocessors will push
performance per chip from the 10 gigaflop to the 10
teraflop range in the coming decade. To support this
increased performance, memory and inter-core bandwidths
will also have to scale by orders of magnitude. Pin
limitations, the energy cost of electrical signaling,
and the non-scalability of chip-length global wires are
significant bandwidth impediments. Recent developments
in silicon nanophotonic technology have the potential
to meet these off- and on-stack bandwidth requirements
at acceptable power levels. Corona is a 3D many-core
architecture that uses nanophotonic communication for
both inter-core communication and off-stack
communication to memory or I/O devices. Its peak
floating-point performance is 10 teraflops. Dense
wavelength division multiplexed optically connected
memory modules provide 10 terabyte per second memory
bandwidth. A photonic crossbar fully interconnects its
256 low-power multithreaded cores at 20 terabyte per
second bandwidth. We have simulated a 1024 thread
Corona system running synthetic benchmarks and scaled
versions of the SPLASH-2 benchmark suite. We believe
that in comparison with an electrically-connected
many-core alternative that uses the same on-stack
interconnect power, Corona can provide 2 to 6 times
more performance on many memory intensive workloads,
while simultaneously reducing power.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "3D stacking; many-core CMP; nanophotonics; on-chip
author = "Vasily Volkov and James W. Demmel",
title = "{$ L U $}, {$ Q R $} and {Cholesky} Factorizations
using Vector Capabilities of {GPUs}",
type = "LAPACK Working Note",
number = "202",
institution = inst-UCB-EECS,
address = inst-UCB-EECS:adr,
month = may,
year = "2008",
bibdate = "Fri Apr 24 12:25:43 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.netlib.org/lapack/lawnspdf/lawn202.pdf",
abstract = "We present performance results for dense linear
algebra using the 8-series NVIDIA GPUs. Our
matrix-matrix multiply routine (GEMM) runs 60\% faster
than the vendor implementation in CUBLAS 1.1 and
approaches the peak of hardware capabilities. Our LU,
QR and Cholesky factorizations achieve up to 80--90\%
of the peak GEMM rate. Our parallel LU running on two
GPUs achieves up to $ \approx $300 Gflop/s. These
results are accomplished by challenging the accepted
view of the GPU architecture and programming
guidelines. We argue that modern GPUs should be viewed
as multithreaded multicore vector units. We exploit
blocking similarly to vector computers and
heterogeneity of the system by computing both on GPU
and CPU. This study includes detailed benchmarking of
the GPU memory system that reveals sizes and latencies
of caches and TLB. We present a couple of algorithmic
optimizations aimed at increasing parallelism and
regularity in the problem that provide us with slightly
higher performance.",
acknowledgement = ack-nhfb,
ucbnumber = "UCB/EECS-2008-49",
author = "Kun Wang and Yu Zhang and Huayong Wang and Xiaowei
title = "Parallelization of {IBM Mambo} system simulator in
functional modes",
journal = j-OPER-SYS-REV,
volume = "42",
number = "1",
pages = "71--76",
month = jan,
year = "2008",
DOI = "https://doi.org/10.1145/1341312.1341325",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Fri Jun 20 17:19:29 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Mambo [4] is IBM's full-system simulator which models
PowerPC systems, and provides a complete set of
simulation tools to help IBM and its partners in
pre-hardware development and performance evaluation for
future systems. Currently Mambo simulates target
systems on a single host thread. When the number of
cores increases in a target system, Mambo's simulation
performance for each core goes down. As the so-called
`multi-core era' approaches, both target and host
systems will have more and more cores. It is very
important for Mambo to efficiently simulate a
multi-core target system on a multi-core host system.
Parallelization is a natural method to speed up Mambo
under this situation.\par
Parallel Mambo (P-Mambo) is a multi-threaded
implementation of Mambo. Mambo's simulation engine is
implemented as a user-level thread-scheduler. We
propose a multi-scheduler method to adapt Mambo's
simulation engine to multi-threaded execution. Based on
this method a core-based module partition is proposed
to achieve both high inter-scheduler parallelism and
low inter-scheduler dependency. Protection of shared
resources is crucial to both correctness and
performance of P-Mambo. Since there are two tiers of
threads in P-Mambo, protecting shared resources by only
OS-level locks possibly introduces deadlocks due to
user-level context switch. We propose a new lock
mechanism to handle this problem. Since Mambo is an
on-going project with many modules currently under
development, co-existence with new modules is also
important to P-Mambo. We propose a global-lock-based
method to guarantee compatibility of P-Mambo with
future Mambo modules.\par
We have implemented the first version of P-Mambo in
functional modes. The performance of P-Mambo has been
evaluated on the OpenMP implementation of NAS Parallel
Benchmark (NPB) 3.2 [12]. Preliminary experimental
results show that P-Mambo achieves an average speedup
of 3.4 on a 4-core host machine.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
keywords = "architectural simulation; dynamic binary translation;
parallel simulation",
author = "Fredrik Warg and Per Stenstr{\"o}m",
title = "Dual-thread Speculation: a Simple Approach to Uncover
Thread-level Parallelism on a Simultaneous
Multithreaded Processor",
journal = j-INT-J-PARALLEL-PROG,
volume = "36",
number = "2",
pages = "166--183",
month = apr,
year = "2008",
DOI = "https://doi.org/10.1007/s10766-007-0064-z",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Wed Jul 9 16:07:03 MDT 2008",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=2;
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=2&spage=166",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
keywords = "Chip multiprocessors; Computer architecture;
Simultaneous multithreading; Thread-level parallelism;
Thread-level speculation",
editor = "David L. Weaver",
title = "{OpenSPARC} Internals: {OpenSPARC T1\slash T2} Chip
Multithreaded Throughput Computing",
publisher = "Lulu, Inc.",
address = "860 Aviation Parkway, Suite 300, Morrisville, NC
27560, USA",
pages = "xviii + 369",
year = "2008",
ISBN = "0-557-01974-5",
ISBN-13 = "978-0-557-01974-8",
LCCN = "????",
bibdate = "Tue Nov 11 14:49:47 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/master.bib;
price = "US\$20.00",
URL = "http://www.opensparc.net/publications/books/opensparc-internals.html",
acknowledgement = ack-nhfb,
libnote = "Not yet in my library.",
author = "Michal Wegiel and Chandra Krintz",
title = "The mapping collector: virtual memory support for
generational, parallel, and concurrent compaction",
journal = j-COMP-ARCH-NEWS,
volume = "36",
number = "1",
pages = "91--102",
month = mar,
year = "2008",
DOI = "https://doi.org/10.1145/1353535.1346294",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Jun 17 11:51:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Parallel and concurrent garbage collectors are
increasingly employed by managed runtime environments
(MREs) to maintain scalability, as multi-core
architectures and multi-threaded applications become
pervasive. Moreover, state-of-the-art MREs commonly
implement compaction to eliminate heap fragmentation
and enable fast linear object allocation.\par
Our empirical analysis of object demographics reveals
that unreachable objects in the heap tend to form
clusters large enough to be effectively managed at the
granularity of virtual memory pages. Even though
processes can manipulate the mapping of the virtual
address space through the standard operating system
(OS) interface on most platforms, extant
parallel/concurrent compactors do not do so to exploit
this clustering behavior and instead achieve compaction
by performing, relatively expensive, object moving and
pointer adjustment.\par
We introduce the Mapping Collector (MC), which
leverages virtual memory operations to reclaim and
consolidate free space without moving objects and
updating pointers. MC is a nearly-single-phase
compactor that is simpler and more efficient than
previously reported compactors that comprise two to
four phases. Through effective MRE-OS coordination, MC
maintains the simplicity of a non-moving collector
while providing efficient parallel and concurrent
We implement both stop-the-world and concurrent MC in a
generational garbage collection framework within the
open-source HotSpot Java Virtual Machine. Our
experimental evaluation using a multiprocessor
indicates that MC significantly increases throughput
and scalability as well as reduces pause times,
relative to state-of-the-art, parallel and concurrent
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "compaction; concurrent; parallel; virtual memory",
author = "Michal Wegiel and Chandra Krintz",
title = "The {Mapping Collector}: virtual memory support for
generational, parallel, and concurrent compaction",
journal = j-OPER-SYS-REV,
volume = "42",
number = "2",
pages = "91--102",
month = mar,
year = "2008",
DOI = "https://doi.org/10.1145/1353535.1346294",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Fri Jun 20 17:20:12 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Parallel and concurrent garbage collectors are
increasingly employed by managed runtime environments
(MREs) to maintain scalability, as multi-core
architectures and multi-threaded applications become
pervasive. Moreover, state-of-the-art MREs commonly
implement compaction to eliminate heap fragmentation
and enable fast linear object allocation.\par
Our empirical analysis of object demographics reveals
that unreachable objects in the heap tend to form
clusters large enough to be effectively managed at the
granularity of virtual memory pages. Even though
processes can manipulate the mapping of the virtual
address space through the standard operating system
(OS) interface on most platforms, extant
parallel/concurrent compactors do not do so to exploit
this clustering behavior and instead achieve compaction
by performing, relatively expensive, object moving and
pointer adjustment.\par
We introduce the Mapping Collector (MC), which
leverages virtual memory operations to reclaim and
consolidate free space without moving objects and
updating pointers. MC is a nearly-single-phase
compactor that is simpler and more efficient than
previously reported compactors that comprise two to
four phases. Through effective MRE-OS coordination, MC
maintains the simplicity of a non-moving collector
while providing efficient parallel and concurrent
We implement both stop-the-world and concurrent MC in a
generational garbage collection framework within the
open-source HotSpot Java Virtual Machine. Our
experimental evaluation using a multiprocessor
indicates that MC significantly increases throughput
and scalability as well as reduces pause times,
relative to state-of-the-art, parallel and concurrent
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
keywords = "compaction; concurrent; parallel; virtual memory",
author = "Michal Wegiel and Chandra Krintz",
title = "The mapping collector: virtual memory support for
generational, parallel, and concurrent compaction",
journal = j-SIGPLAN,
volume = "43",
number = "3",
pages = "91--102",
month = mar,
year = "2008",
DOI = "https://doi.org/10.1145/1353535.1346294",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jun 18 11:03:40 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Parallel and concurrent garbage collectors are
increasingly employed by managed runtime environments
(MREs) to maintain scalability, as multi-core
architectures and multi-threaded applications become
pervasive. Moreover, state-of-the-art MREs commonly
implement compaction to eliminate heap fragmentation
and enable fast linear object allocation.\par
Our empirical analysis of object demographics reveals
that unreachable objects in the heap tend to form
clusters large enough to be effectively managed at the
granularity of virtual memory pages. Even though
processes can manipulate the mapping of the virtual
address space through the standard operating system
(OS) interface on most platforms, extant
parallel/concurrent compactors do not do so to exploit
this clustering behavior and instead achieve compaction
by performing, relatively expensive, object moving and
pointer adjustment.\par
We introduce the Mapping Collector (MC), which
leverages virtual memory operations to reclaim and
consolidate free space without moving objects and
updating pointers. MC is a nearly-single-phase
compactor that is simpler and more efficient than
previously reported compactors that comprise two to
four phases. Through effective MRE-OS coordination, MC
maintains the simplicity of a non-moving collector
while providing efficient parallel and concurrent
We implement both stop-the-world and concurrent MC in a
generational garbage collection framework within the
open-source HotSpot Java Virtual Machine. Our
experimental evaluation using a multiprocessor
indicates that MC significantly increases throughput
and scalability as well as reduces pause times,
relative to state-of-the-art, parallel and concurrent
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "compaction; concurrent; parallel; virtual memory",
author = "Jonathan A. Winter and David H. Albonesi",
title = "Addressing thermal nonuniformity in {SMT} workloads",
journal = j-TACO,
volume = "5",
number = "1",
pages = "4:1--4:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1369400",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We explore DTM techniques within the context of
uniform and nonuniform SMT workloads. While DVS is
suitable for addressing workloads with uniformly high
temperatures, for nonuniform workloads, performance
loss occurs because of the slowdown of the cooler
thread. To address this, we propose and evaluate DTM
mechanisms that exploit the steering-based thread
management mechanisms inherent in a clustered SMT
architecture. We show that in contrast to DVS, which
operates globally, our techniques are more effective at
controlling temperature for nonuniform workloads.
Furthermore, we devise a DTM technique that combines
steering and DVS to achieve consistently good
performance across all workloads.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
keywords = "adaptive microarchitectures; clustered
microarchitectures; dynamic thermal management; dynamic
voltage scaling; simultaneous multithreading",
author = "Chee Siang Wong and Ian Tan and Rosalind Deena Kumari
and Fun Wey",
title = "Towards achieving fairness in the {Linux} scheduler",
journal = j-OPER-SYS-REV,
volume = "42",
number = "5",
pages = "34--43",
month = jul,
year = "2008",
DOI = "https://doi.org/10.1145/1400097.1400102",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Wed Aug 6 16:54:12 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The Operating System scheduler is designed to allocate
the CPU resources appropriately to all processes. The
Linux Completely Fair Scheduler (CFS) design ensures
fairness among tasks using the thread fair scheduling
algorithm. This algorithm ensures allocation of
resources based on the number of threads in the system
and not within executing programs. This can lead to
fairness issues in a multi-threaded environment as the
Linux scheduler tends to favor programs with a higher
number of threads. We illustrate the issue of fairness
through experimental evaluation thus exposing the
weakness of the current allocation scheme where
software developers could take advantage by spawning
many additional threads in order to obtain more CPU
resources. A novel algorithm is proposed as a solution
towards achieving better fairness in the Linux
scheduler. The algorithm is based on weight
readjustment of the threads created in the same process
to significantly reduce the unfair allocation of CPU
resources in multi-threaded environments. The algorithm
was implemented and evaluated. It demonstrated
promising results towards solving the raised fairness
issue. We conclude this paper highlighting the
limitations of the proposed approach and the future
work in the stated direction.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
keywords = "completely fair scheduler; fairness; Linux; process
author = "Feng Xian and Witawas Srisa-an and Hong Jiang",
title = "Contention-aware scheduler: unlocking execution
parallelism in multithreaded {Java} programs",
journal = j-SIGPLAN,
volume = "43",
number = "10",
pages = "163--180",
month = sep,
year = "2008",
DOI = "https://doi.org/10.1145/1449955.1449778",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Oct 22 09:57:37 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In multithreaded programming, locks are frequently
used as a mechanism for synchronization. Because
today's operating systems do not consider lock usage as
a scheduling criterion, scheduling decisions can be
unfavorable to multithreaded applications, leading to
performance issues such as convoying and heavy lock
contention in systems with multiple processors.
Previous efforts to address these issues (e.g.,
transactional memory, lock-free data structure) often
treat scheduling decisions as `a fact of life,' and
therefore these solutions try to cope with the
consequences of undesirable scheduling instead of
dealing with the problem directly.\par
In this paper, we introduce {\em Contention-Aware
Scheduler (CA-Scheduler)}, which is designed to support
efficient execution of large multithreaded Java
applications in multiprocessor systems. Our proposed
scheduler employs a scheduling policy that reduces lock
contention. As will be shown in this paper, our
prototype implementation of the CA-Scheduler in Linux
and Sun HotSpot virtual machine only incurs 3.5\%
runtime overhead, while the overall performance
differences, when compared with a system with no
contention awareness, range from a degradation of 3\%
in a small multithreaded benchmark to an improvement of
15\% in a large Java application server benchmark.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "Java; operating systems; scheduling",
author = "Jung Ho Ahn and Jacob Leverich and Robert S. Schreiber
and Norman P. Jouppi",
title = "Multicore {DIMM}: an Energy Efficient Memory Module
with Independently Controlled {DRAMs}",
volume = "8",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Demand for memory capacity and bandwidth keeps
increasing rapidly in modern computer systems, and
memory power consumption is becoming a considerable
portion of the system power budget. However, the
current DDR DIMM standard is not well suited to
effectively serve CMP memory requests from both a power
and performance perspective. We propose a new memory
module called a Multicore DIMM, where DRAM chips are
grouped into multiple virtual memory devices, each of
which has its own data path and receives separate
commands (address and control signals). The Multicore
DIMM is designed to improve the energy efficiency of
memory systems with small impact on system performance.
Dividing each memory module into 4 virtual memory
devices brings a simultaneous 22\%, 7.6\%, and 18\%
improvement in memory power, IPC, and system
energy-delay product respectively on a set of
multithreaded applications and consolidated
acknowledgement = ack-nhfb,
affiliation = "Ahn, JH (Reprint Author), Hewlett Packard Labs,
Mississauga, ON, Canada. Ahn, Jung Ho; Schreiber,
Robert S.; Jouppi, Norman P., Hewlett Packard Labs,
Mississauga, ON, Canada. Leverich, Jacob, Stanford
Univ, Stanford, CA 94305 USA.",
da = "2019-06-20",
doc-delivery-number = "V17GC",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "DRAM; memory module; memory system; Multicore",
number-of-cited-references = "16",
ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394",
research-areas = "Computer Science",
researcherid-numbers = "Ahn, Jung Ho/D-1298-2013",
times-cited = "26",
unique-id = "Ahn:2009:MDE",
web-of-science-categories = "Computer Science, Hardware \&
author = "Farhana Aleen and Nathan Clark",
title = "Commutativity analysis for software parallelization:
letting program transformations see the big picture",
journal = j-SIGPLAN,
volume = "44",
number = "3",
pages = "241--252",
month = mar,
year = "2009",
DOI = "https://doi.org/10.1145/1508284.1508273",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Jun 16 14:39:26 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Extracting performance from many-core architectures
requires software engineers to create multi-threaded
applications, which significantly complicates the
already daunting task of software development. One
solution to this problem is automatic compile-time
parallelization, which can ease the burden on software
developers in many situations. Clearly, automatic
parallelization in its present form is not suitable for
many application domains and new compiler analyses are
needed to address its shortcomings.\par
In this paper, we present one such analysis: a new
approach for detecting commutative functions.
Commutative functions are sections of code that can be
executed in any order without affecting the outcome of
the application, e.g., inserting elements into a set.
Previous research on this topic had one significant
limitation, in that the results of commutative
functions must produce identical memory layouts. This
prevented previous techniques from detecting functions
like malloc, which may return different pointers
depending on the order in which it is called, but these
differing results do not affect the overall output of
the application. Our new commutativity analysis
correctly identifies these situations to better
facilitate automatic parallelization. We demonstrate
that this analysis can automatically extract
significant amounts of parallelism from many
applications, and where it is ineffective it can
provide software developers a useful list of functions
that may be commutative provided semantic program
changes that are not automatable.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "automatic software parallelization; commutative
functions; random interpretation",
author = "Satoshi Amamiya and Makoto Amamiya and Ryuzo Hasegawa
and Hiroshi Fujita",
title = "A continuation-based noninterruptible multithreading
processor architecture",
volume = "47",
number = "2",
pages = "228--252",
month = feb,
year = "2009",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Wed Aug 25 08:38:29 MDT 2010",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=47&issue=2;
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=47&issue=2&spage=228",
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Zachary R. Anderson and David Gay and Mayur Naik",
title = "Lightweight annotations for controlling sharing in
concurrent data structures",
journal = j-SIGPLAN,
volume = "44",
number = "6",
pages = "98--109",
month = jun,
year = "2009",
DOI = "https://doi.org/10.1145/1542476.1542488",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Jun 16 14:41:16 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "SharC is a recently developed system for checking
data-sharing in multithreaded programs. Programmers
specify sharing rules (read-only, protected by a lock,
etc.) for individual objects, and the SharC compiler
enforces these rules using static and dynamic checks.
Violations of these rules indicate unintended data
sharing, which is the underlying cause of harmful
data-races. Additionally, SharC allows programmers to
change the sharing rules for a specific object using a
{\em sharing cast}, to capture the fact that sharing
rules for an object often change during the object's
lifetime. SharC was successfully applied to a number of
multi-threaded C programs.\par
However, many programs are not readily checkable using
SharC because their sharing rules, and changes to
sharing rules, effectively apply to whole data
structures rather than to individual objects. We have
developed a system called {\em Shoal\/} to address this
shortcoming. In addition to the sharing rules and
sharing cast of SharC, our system includes a new
concept that we call {\em groups}. A group is a
collection of objects all having the same sharing mode.
Each group has a distinguished member called the {\em
group leader}. When the sharing mode of the group
leader changes by way of a sharing cast, the sharing
mode of all members of the group also changes. This
operation is made sound by maintaining the invariant
that at the point of a sharing cast, the only external
pointer into the group is the pointer to the group
leader. The addition of groups allows checking safe
concurrency at the level of data structures rather than
at the level of individual objects.\par
We demonstrate the necessity and practicality of groups
by applying Shoal to a wide range of concurrent C
programs (the largest approaching a million lines of
code). In all benchmarks groups entail low annotation
burden and no significant additional performance
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrent programming; data races; multithreaded
author = "Christos D. Antonopoulos and Filip Blagojevic and
Andrey N. Chernikov and Nikos P. Chrisochoides and
Dimitrios S. Nikolopoulos",
title = "Algorithm, software, and hardware optimizations for
{Delaunay} mesh generation on simultaneous
multithreaded architectures",
journal = j-J-PAR-DIST-COMP,
volume = "69",
number = "7",
pages = "601--612",
month = jul,
year = "2009",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Wed Sep 1 16:27:25 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Omid Azizi and Aqeel Mahesri and Sanjay J. Patel and
Mark Horowitz",
title = "Area-efficiency in {CMP} core design: co-optimization
of microarchitecture and physical design",
journal = j-COMP-ARCH-NEWS,
volume = "37",
number = "2",
pages = "56--65",
month = may,
year = "2009",
DOI = "https://doi.org/10.1145/1577129.1577138",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Aug 11 18:12:39 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In this paper, we examine the area-performance design
space of a processing core for a chip multiprocessor
(CMP), considering both the architectural design space
and the tradeoffs of the physical design on which the
architecture relies. We first propose a methodology for
performing an integrated optimization of both the
micro-architecture and the physical circuit design of a
microprocessor. In our approach, we use statistical and
convex fitting methods to capture a large
micro-architectural design space. We then characterize
the area-delay tradeoffs of the underlying circuits
through RTL synthesis. Finally, we establish the
relationship between the architecture and the circuits
in an integrative model, which we use to optimize the
processor. As a case study, we apply this methodology
to explore the performance-area tradeoffs in a highly
parallel accelerator architecture for visual computing
applications. Based on some early circuit tradeoff
data, our results indicate that two separate designs
are performance/area optimal for our set of benchmarks:
a simpler single-issue, 2-way multithreaded core
running at high-frequency, and a more aggressively
tuned dual-issue 4-way multithreaded design running at
a lower frequency.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Bruce R. Barkstrom",
title = "On using {Ada} to solve problems in computational
economics and related disciplines with concurrent,
multiagent algorithms",
journal = j-SIGADA-LETTERS,
volume = "29",
number = "3",
pages = "61--72",
month = dec,
year = "2009",
DOI = "https://doi.org/10.1145/1647420.1647437",
ISSN = "1094-3641 (print), 1557-9476 (electronic)",
ISSN-L = "1094-3641",
bibdate = "Mon Jun 21 14:04:37 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multiagent algorithms are widely used in computational
economics and other social sciences to solve
theoretical and practical problems. Because such
algorithms are inherently concurrent and multithreaded,
Ada's constructs for handling communications between
concurrent processes and avoiding interference between
them make the language very well suited to solving
these problems, particularly given developments in
multi-core CPU chip-making. This paper provides a
concrete example of how Ada assists in solving problems
in computational economics and related disciplines that
work with multiagent systems. Solving a simple problem
illustrates visualizing the agents as Ada tasks, using
UML use cases and synchronization diagrams to design
the communications patterns between agents, and
applying protected objects and functions to avoid
computational indeterminacy.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGAda Ada Letters",
keywords = "computational and mathematical organization theory;
computational economics; concurrent programming;
multiagent systems; multithreaded programming",
author = "Christopher Barnes and Pranav Vaidya and Jaehwan John
title = "An {XML}-Based {ADL} Framework for Automatic
Generation of Multithreaded Computer Architecture
volume = "8",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2009",
DOI = "https://doi.org/10.1109/L-CA.2009.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Computer architecture simulation has always played a
pivotal role in continuous innovation of computers.
However, constructing or modifying a high quality
simulator is time consuming and error-prone. Thus,
often Architecture Description Languages (ADLs) are
used to provide an abstraction layer for describing the
computer architecture and automatically generating
corresponding simulators. Along the line of such
research, we present a novel XML-based ADL, its
compiler, and a generation methodology to automatically
generate multithreaded simulators for computer
architecture. We utilize the industry-standard
extensible markup language XML to describe the
functionality and architecture of a modeled processor.
Our ADL framework allows users to easily and quickly
modify the structure, register set, and execution of a
modeled processor. To prove its validity, we have
generated several multithreaded simulators with
different configurations based on the MIPS five-stage
processor, and successfully tested with two programs.",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "V17GC",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "IUPUI RSFG",
funding-text = "This research was funded by the IUPUI RSFG grant.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "C.0.d Modeling of computer architecture; C.1.1.b
Pipeline processors",
number-of-cited-references = "14",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Barnes:2009:XBA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
author = "Emery D. Berger and Ting Yang and Tongping Liu and
Gene Novark",
title = "{Grace}: safe multithreaded programming for {C\slash
C++}",
journal = j-SIGPLAN,
volume = "44",
number = "10",
pages = "81--96",
month = oct,
year = "2009",
DOI = "https://doi.org/10.1145/1640089.1640096",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Jun 21 18:01:56 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The shift from single to multiple core architectures
means that programmers must write concurrent,
multithreaded programs in order to increase application
performance. Unfortunately, multithreaded applications
are susceptible to numerous errors, including
deadlocks, race conditions, atomicity violations, and
order violations. These errors are notoriously
difficult for programmers to debug.\par
This paper presents Grace, a software-only runtime
system that eliminates concurrency errors for a class
of multithreaded programs: those based on fork-join
parallelism. By turning threads into processes,
leveraging virtual memory protection, and imposing a
sequential commit protocol, Grace provides programmers
with the appearance of deterministic, sequential
execution, while taking advantage of available
processing cores to run code concurrently and
efficiently. Experimental results demonstrate Grace's
effectiveness: with modest code changes across a suite
of computationally-intensive benchmarks (1-16 lines),
Grace can achieve high scalability and performance
while preventing concurrency errors.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrency; determinism; deterministic concurrency;
fork-join; sequential semantics",
author = "Robert L. {Bocchino, Jr.} and Vikram S. Adve and Danny
Dig and Sarita V. Adve and Stephen Heumann and Rakesh
Komuravelli and Jeffrey Overbey and Patrick Simmons and
Hyojin Sung and Mohsen Vakilian",
title = "A type and effect system for deterministic parallel
{Java}",
journal = j-SIGPLAN,
volume = "44",
number = "10",
pages = "97--116",
month = oct,
year = "2009",
DOI = "https://doi.org/10.1145/1639949.1640097",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Jun 21 18:01:56 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Today's shared-memory parallel programming models are
complex and error-prone. While many parallel programs
are intended to be deterministic, unanticipated thread
interleavings can lead to subtle bugs and
nondeterministic semantics. In this paper, we
demonstrate that a practical {\em type and effect
system\/} can simplify parallel programming by {\em
guaranteeing deterministic semantics\/} with modular,
compile-time type checking even in a rich, concurrent
object-oriented language such as Java. We describe an
object-oriented type and effect system that provides
several new capabilities over previous systems for
expressing deterministic parallel algorithms. We also
describe a language called Deterministic Parallel Java
(DPJ) that incorporates the new type system features,
and we show that a core subset of DPJ is sound. We
describe an experimental validation showing that DPJ
can express a wide range of realistic parallel
programs; that the new type system features are useful
for such programs; and that the parallel programs
exhibit good performance gains (coming close to or
beating equivalent, nondeterministic multithreaded
programs where those are available).",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "commutativity; determinism; deterministic parallelism;
effect systems; effects",
author = "Stanislav Bratanov and Roman Belenov and Nikita
Manovich",
title = "Virtual machines: a whole new world for performance
analysis",
journal = j-OPER-SYS-REV,
volume = "43",
number = "2",
pages = "46--55",
month = apr,
year = "2009",
DOI = "https://doi.org/10.1145/1531793.1531802",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Thu Apr 23 19:43:22 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This article addresses a problem of performance
monitoring inside virtual machines (VMs). It advocates
focused monitoring of particular virtualized programs,
explains the need for and the importance of such an
approach to performance monitoring in virtualized
execution environments, and emphasizes its benefits for
virtual machine manufacturers, virtual machine users
(mostly, software developers) and hardware (processor)
manufacturers. The article defines the problem of in-VM
performance monitoring as the ability to employ modern
methods and hardware performance monitoring
capabilities inside virtual machines to an extent
comparable with what is being done in real
environments. Unfortunately, there are numerous reasons
preventing us from achieving such an ambitious goal,
one of those reasons being the lack of support from
virtualization engines; that is why a novel method of
'cooperative' performance data collection is disclosed.
The method implies collection of performance data at
physical hardware and simultaneous tracking of software
states inside a virtual machine. Each statistically
visible execution point of the virtualized software may
then be associated with information on real hardware
events. The method effectively enables time-based
sampling of virtualized workloads combined with
hardware event counting, is applicable to unmodified,
commercially available virtual machines, and has
competitive precision and overhead. The practical
significance and value of the method are further
illustrated by studying a parallel workload and
uncovering virtualization-specific performance issues
of multithreaded programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
keywords = "hardware performance event counters; virtual
machines",
author = "Seungryul Choi and Donald Yeung",
title = "Hill-climbing {SMT} processor resource distribution",
journal = j-TOCS,
volume = "27",
number = "1",
pages = "1:1--1:??",
month = feb,
year = "2009",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Fri Feb 13 18:30:25 MST 2009",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The key to high performance in Simultaneous
MultiThreaded (SMT) processors lies in optimizing the
distribution of shared resources to active threads.
Existing resource distribution techniques optimize
performance only indirectly. They infer potential
performance bottlenecks by observing indicators, like
instruction occupancy or cache miss counts, and take
actions to try to alleviate them. While the corrective
actions are designed to improve performance, their
actual performance impact is not known since end
performance is never monitored. Consequently, potential
performance gains are lost whenever the corrective
actions do not effectively address the actual
bottlenecks occurring in the pipeline.\par
We propose a different approach to SMT resource
distribution that optimizes end performance directly.
Our approach observes the impact that resource
distribution decisions have on performance at runtime,
and feeds this information back to the resource
distribution mechanisms to improve future decisions. By
evaluating many different resource distributions, our
approach tries to learn the best distribution over
time. Because we perform learning online, learning time
is crucial. We develop a hill-climbing algorithm that
quickly learns the best distribution of resources by
following the performance gradient within the resource
distribution space. We also develop several ideal
learning algorithms to enable deeper insights through
limit studies.\par
This article conducts an in-depth investigation of
hill-climbing SMT resource distribution using a
comprehensive suite of 63 multiprogrammed workloads.
Our results show hill-climbing outperforms ICOUNT,
FLUSH, and DCRA (three existing SMT techniques) by
11.4\%, 11.5\%, and 2.8\%, respectively, under the
weighted IPC metric. A limit study conducted using our
ideal learning algorithms shows our approach can
potentially outperform the same techniques by 19.2\%,
18.0\%, and 7.6\%, respectively, thus demonstrating
additional room exists for further improvement. Using
our ideal algorithms, we also identify three
bottlenecks that limit online learning speed: local
maxima, phased behavior, and interepoch jitter. We
define metrics to quantify these learning bottlenecks,
and characterize the extent to which they occur in our
workloads. Finally, we conduct a sensitivity study, and
investigate several extensions to improve our
hill-climbing technique.",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
editor = "Thomas H. Cormen and Charles Eric Leiserson and Ronald
L. Rivest and Clifford Stein",
title = "Introduction to algorithms",
publisher = pub-MIT,
address = pub-MIT:adr,
edition = "Third",
pages = "xix + 1292",
year = "2009",
ISBN = "0-262-03384-4 (hardcover), 0-262-53305-7 (paperback)",
ISBN-13 = "978-0-262-03384-8 (hardcover), 978-0-262-53305-8
(paperback)",
LCCN = "QA76.6 .C662 2009",
bibdate = "Thu Sep 9 14:42:33 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Some books on algorithms are rigorous but incomplete;
others cover masses of material but lack rigor.
Introduction to Algorithms uniquely combines rigor and
comprehensiveness. The book covers a broad range of
algorithms in depth, yet makes their design and
analysis accessible to all levels of readers. Each
chapter is relatively self-contained and can be used as
a unit of study. The algorithms are described in
English and in a pseudocode designed to be readable by
anyone who has done a little programming. The
explanations have been kept elementary without
sacrificing depth of coverage or mathematical rigor.
The first edition became a widely used text in
universities worldwide as well as the standard
reference for professionals. The second edition
featured new chapters on the role of algorithms,
probabilistic analysis and randomized algorithms, and
linear programming. The third edition has been revised
and updated throughout. It includes two completely new
chapters, on van Emde Boas trees and multithreaded
algorithms, and substantial additions to the chapter on
recurrences (now called ``Divide-and-Conquer''). It
features improved treatment of dynamic programming and
greedy algorithms and a new notion of edge-based flow
in the material on flow networks. Many new exercises
and problems have been added for this edition.",
acknowledgement = ack-nhfb,
libnote = "Not in my library.",
subject = "Computer programming; Computer algorithms",
author = "Andrzej Daniluk",
title = "Multithreaded transactions in scientific computing.
{The} {Growth06\_v2} program",
journal = j-COMP-PHYS-COMM,
volume = "180",
number = "7",
pages = "1219--1220",
month = jul,
year = "2009",
DOI = "https://doi.org/10.1016/j.cpc.2009.01.024",
ISSN = "0010-4655 (print), 1879-2944 (electronic)",
ISSN-L = "0010-4655",
bibdate = "Mon Feb 13 23:42:43 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.sciencedirect.com/science/article/pii/S0010465509000393",
acknowledgement = ack-nhfb,
fjournal = "Computer Physics Communications",
journal-URL = "http://www.sciencedirect.com/science/journal/00104655",
author = "F. S. de Boer",
title = "A shared-variable concurrency analysis of
multi-threaded object-oriented programs",
journal = j-THEOR-COMP-SCI,
volume = "410",
number = "2--3",
pages = "128--141",
day = "6",
month = feb,
year = "2009",
ISSN = "0304-3975 (print), 1879-2294 (electronic)",
ISSN-L = "0304-3975",
bibdate = "Mon Mar 28 21:21:46 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Theoretical Computer Science",
journal-URL = "http://www.sciencedirect.com/science/journal/03043975",
author = "Aniruddha Desai and Jugdutt Singh",
title = "Architecture Independent Characterization of Embedded
{Java} Workloads",
volume = "8",
number = "1",
pages = "29--32",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper presents architecture independent
characterization of embedded Java workloads based on
the industry standard GrinderBench benchmark which
includes different classes of real world embedded Java
applications. This work is based on a custom built
embedded Java Virtual Machine (JVM) simulator
specifically designed for embedded JVM modeling and
embodies domain specific details such as thread
scheduling, algorithms used for native CLDC APIs and
runtime data structures optimized for use in embedded
systems. The results presented include dynamic
execution characteristics, dynamic bytecode instruction
mix, application and API workload distribution, Object
allocation statistics, instruction-set coverage, memory
usage statistics and method code and stack frame
characteristics.",
acknowledgement = ack-nhfb,
affiliation = "Desai, A (Reprint Author), La Trobe Univ, Bundoora,
Vic 3086, Australia. Desai, Aniruddha; Singh, Jugdutt,
La Trobe Univ, Bundoora, Vic 3086, Australia.",
author-email = "desai@ieee.org",
da = "2019-06-20",
doc-delivery-number = "V17GC",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithm design and analysis; application program
interfaces; architecture independent characterization;
CLDC API; custom built embedded Java virtual machine
simulator; data structures; Data structures; Design
optimization; dynamic bytecode instruction mix; dynamic
execution characteristics; embedded Java workload;
Embedded Systems; embedded systems; Embedded Systems;
industry standard GrinderBench benchmark; instruction
sets; instruction-set coverage; Java; Java bytecode;
Job shop scheduling; JVM; memory usage statistics;
method code characteristics; multi-threading; object
allocation statistics; Runtime; runtime data structure;
scheduling; Scheduling algorithm; stack frame
characteristics; Statistical distributions; storage
allocation; thread scheduling; virtual machines;
Virtual machining; Workload Characterization",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Desai:2009:AIC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
author = "Joseph Devietti and Brandon Lucia and Luis Ceze and
Mark Oskin",
title = "{DMP}: deterministic shared memory multiprocessing",
journal = j-SIGPLAN,
volume = "44",
number = "3",
pages = "85--96",
month = mar,
year = "2009",
DOI = "https://doi.org/10.1145/1508244.1508255",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Jun 16 14:39:26 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Current shared memory multicore and multiprocessor
systems are nondeterministic. Each time these systems
execute a multithreaded application, even if supplied
with the same input, they can produce a different
output. This frustrates debugging and limits the
ability to properly test multithreaded code, becoming a
major stumbling block to the much-needed widespread
adoption of parallel programming.\par
In this paper we make the case for fully deterministic
shared memory multiprocessing (DMP). The behavior of an
arbitrary multithreaded program on a DMP system is only
a function of its inputs. The core idea is to make
inter-thread communication fully deterministic.
Previous approaches to coping with nondeterminism in
multithreaded programs have focused on replay, a
technique useful only for debugging. In contrast, while
DMP systems are directly useful for debugging by
offering repeatability by default, we argue that
parallel programs should execute deterministically in
the field as well. This has the potential to make
testing more assuring and increase the reliability of
deployed multithreaded software. We propose a range of
approaches to enforcing determinism and discuss their
implementation trade-offs. We show that determinism can
be provided with little performance cost using our
architecture proposals on future hardware, and that
software-only approaches can be utilized on existing
systems.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "debugging; determinism; multicores; parallel
programming",
author = "Stijn Eyerman and Lieven Eeckhout",
title = "Memory-level parallelism aware fetch policies for
simultaneous multithreading processors",
journal = j-TACO,
volume = "6",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1509864.1509867",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 7 14:55:25 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "A thread executing on a simultaneous multithreading
(SMT) processor that experiences a long-latency load
will eventually stall while holding execution
resources. Existing long-latency load aware SMT fetch
policies limit the amount of resources allocated by a
stalled thread by identifying long-latency loads and
preventing the thread from fetching more instructions
--- and in some implementations, instructions beyond
the long-latency load are flushed to release allocated
resources.\par
This article proposes an SMT fetch policy that takes
into account the available memory-level parallelism
(MLP) in a thread. The key idea proposed in this
article is that in case of an isolated long-latency
load (i.e., there is no MLP), the thread should be
prevented from allocating additional resources.
However, in case multiple independent long-latency
loads overlap (i.e., there is MLP), the thread should
allocate as many resources as needed in order to fully
expose the available MLP. MLP-aware fetch policies
achieve better performance for MLP-intensive threads on
SMT processors, leading to higher overall system
throughput and shorter average turnaround time than
previously proposed fetch policies.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
keywords = "Fetch Policy; Memory-Level Parallelism (MLP);
Simultaneous Multithreading (SMT)",
author = "Stijn Eyerman and Lieven Eeckhout",
title = "Per-thread cycle accounting in {SMT} processors",
journal = j-SIGPLAN,
volume = "44",
number = "3",
pages = "133--144",
month = mar,
year = "2009",
DOI = "https://doi.org/10.1145/1508284.1508260",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Jun 16 14:39:26 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper proposes a cycle accounting architecture
for Simultaneous Multithreading (SMT) processors that
estimates the execution times for each of the threads
had they been executed alone, while they are running
simultaneously on the SMT processor. This is done by
accounting each cycle to either a base, miss event or
waiting cycle component during multi-threaded
execution. Single-threaded alone execution time is then
estimated as the sum of the base and miss event
components; the waiting cycle component represents the
lost cycle count due to SMT execution. The cycle
accounting architecture incurs reasonable hardware cost
(around 1KB of storage) and estimates single-threaded
performance with average prediction errors around 7.2\%
for two-program workloads and 11.7\% for four-program
workloads.\par
The cycle accounting architecture has several important
applications to system software and its interaction
with SMT hardware. For one, the estimated single-thread
alone execution time provides an accurate picture to
system software of the actually consumed processor
cycles per thread. The alone execution time instead of
the total execution time (timeslice) may make system
software scheduling policies more effective. Second, a
new class of thread-progress aware SMT fetch policies
based on per-thread progress indicators enable system
software level priorities to be enforced at the
hardware level.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "cycle accounting; simultaneous multithreading (SMT);
thread-progress aware fetch policy",
author = "Cormac Flanagan and Stephen N. Freund",
title = "{FastTrack}: efficient and precise dynamic race
detection",
journal = j-SIGPLAN,
volume = "44",
number = "6",
pages = "121--133",
month = jun,
year = "2009",
DOI = "https://doi.org/10.1145/1542476.1542490",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Jun 16 14:41:16 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multithreaded programs are notoriously prone to race
conditions. Prior work on dynamic race detectors
includes fast but imprecise race detectors that report
false alarms, as well as slow but precise race
detectors that never report false alarms. The latter
typically use expensive vector clock operations that
require time linear in the number of program
threads.\par
This paper exploits the insight that the full
generality of vector clocks is unnecessary in most
cases. That is, we can replace heavyweight vector
clocks with an adaptive lightweight representation
that, for almost all operations of the target program,
requires only constant space and supports constant-time
operations. This representation change significantly
improves time and space performance, with no loss in
precision.\par
Experimental results on Java benchmarks including the
Eclipse development environment show that our FastTrack
race detector is an order of magnitude faster than a
traditional vector-clock race detector, and roughly
twice as fast as the high-performance DJIT+ algorithm.
FastTrack is even comparable in speed to Eraser on our
Java benchmarks, while never reporting false alarms.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrency; dynamic analysis; race conditions",
author = "Wilson W. L. Fung and Ivan Sham and George Yuan and
Tor M. Aamodt",
title = "Dynamic warp formation: {Efficient MIMD} control flow
on {SIMD} graphics hardware",
journal = j-TACO,
volume = "6",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543753.1543756",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 2 12:32:04 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Recent advances in graphics processing units (GPUs)
have resulted in massively parallel hardware that is
easily programmable and widely available in today's
desktop and notebook computer systems. GPUs typically
use single-instruction, multiple-data (SIMD) pipelines
to achieve high performance with minimal overhead for
control hardware. Scalar threads running the same
computing kernel are grouped together into SIMD
batches, sometimes referred to as warps. While SIMD is
ideally suited for simple programs, recent GPUs include
control flow instructions in the GPU instruction set
architecture and programs using these instructions may
experience reduced performance due to the way branch
execution is supported in hardware. One solution is to
add a stack to allow different SIMD processing elements
to execute distinct program paths after a branch
instruction. The occurrence of diverging branch
outcomes for different processing elements
significantly degrades performance using this approach.
In this article, we propose dynamic warp formation and
scheduling, a mechanism for more efficient SIMD branch
execution on GPUs. It dynamically regroups threads into
new warps on the fly following the occurrence of
diverging branch outcomes. We show that a realistic
hardware implementation of this mechanism improves
performance by 13\%, on average, with 256 threads per
core, 24\% with 512 threads, and 47\% with 768 threads
for an estimated area increase of 8\%.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
keywords = "control flow; fine-grained multithreading; GPU; SIMD",
author = "Ron Gabor and Avi Mendelson and Shlomo Weiss",
title = "Service level agreement for multithreaded processors",
journal = j-TACO,
volume = "6",
number = "2",
pages = "6:1--6:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543753.1543755",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 2 12:32:04 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multithreading is widely used to increase processor
throughput. As the number of shared resources increase,
managing them while guaranteeing predicted performance
becomes a major problem. Attempts have been made in
previous work to ease this via different fairness
mechanisms. In this article, we present a new approach
to control the resource allocation and sharing via a
service level agreement (SLA)-based mechanism; that is,
via an agreement in which multithreaded processors
guarantee a minimal level of service to the running
threads. We introduce a new metric, {\em C\/}$_{SLA}$,
for conformance to SLA in multithreaded processors and
show that controlling resources using with SLA allows
for higher gains than are achievable by previously
suggested fairness techniques. It also permits
improving one metric (e.g., power) while maintaining
SLA in another (e.g., performance). We compare SLA
enforcement to schemes based on other fairness metrics,
which are mostly targeted at equalizing execution
parameters. We show that using SLA rather than fairness
based algorithms provides a range of acceptable
execution points from which we can select the point
that best fits our optimization target, such as
maximizing the weighted speedup (sum of the speedups of
the individual threads) or reducing power. We
demonstrate the effectiveness of the new SLA approach
using switch-on-event (coarse-grained) multithreading.
Our weighted speedup improvement scheme successfully
enforces SLA while improving the weighted speedup by an
average of 10\% for unbalanced threads. This result is
significant when compared with performance losses that
may be incurred by fairness enforcement methods. When
optimizing for power reduction in unbalanced threads
SLA enforcement reduces the power by an average of
15\%. SLA may be complemented by other power reduction
methods to achieve further power savings {\em and\/}
maintain the same service level for the threads. We
also demonstrate differentiated SLA, where weighted
speedup is maximized while each thread may have a
different throughput constraint.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
keywords = "fairness; performance; power; Service level agreement;
throughput",
author = "Pierre Ganty and Rupak Majumdar and Andrey
Rybalchenko",
title = "Verifying liveness for asynchronous programs",
journal = j-SIGPLAN,
volume = "44",
number = "1",
pages = "102--113",
month = jan,
year = "2009",
DOI = "https://doi.org/10.1145/1594834.1480895",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Fri Oct 9 08:40:38 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Asynchronous or 'event-driven' programming is a
popular technique to efficiently and flexibly manage
concurrent interactions. In these programs, the
programmer can post tasks that get stored in a task
buffer and get executed atomically by a non-preemptive
scheduler at a future point. We give a decision
procedure for the fair termination property of
asynchronous programs. The fair termination problem
asks, given an asynchronous program and a fairness
condition on its executions, does the program always
terminate on fair executions? The fairness assumptions
rule out certain undesired bad behaviors, such as where
the scheduler ignores a set of posted tasks forever, or
where a non-deterministic branch is always chosen in
one direction. Since every liveness property reduces to
a fair termination property, our decision procedure
extends to liveness properties of asynchronous
programs. Our decision procedure for the fair
termination of asynchronous programs assumes all
variables are finite-state. Even though variables are
finite-state, asynchronous programs can have an
unbounded stack from recursive calls made by tasks, as
well as an unbounded task buffer of pending tasks. We
show a reduction from the fair termination problem for
asynchronous programs to fair termination problems on
Petri Nets, and our main technical result is a
reduction of the latter problem to Presburger
satisfiability. Our decidability result is in contrast
to multithreaded recursive programs, for which liveness
properties are undecidable. While we focus on fair
termination, we show our reduction to Petri Nets can be
used to prove related properties such as fair
nonstarvation (every posted task is eventually
executed) and safety properties such as boundedness
(find a bound on the maximum number of posted tasks
that can be in the task buffer at any point).",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "asynchronous (event-driven) programming; fair
termination; liveness; Petri nets",
author = "Robert Granat and Bo K{\aa}gstr{\"o}m and Daniel
Kressner",
title = "A novel parallel {$ Q R $} algorithm for hybrid
distributed memory {HPC} systems",
type = "LAPACK Working Note",
number = "216",
institution = "Department of Computing Science and HPC2N",
address = "Ume{\aa} University, S-901 Ume{\aa}, Sweden",
month = apr,
year = "2009",
bibdate = "Fri Apr 24 12:25:43 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.netlib.org/lapack/lawnspdf/lawn216.pdf",
abstract = "A novel variant of the parallel QR algorithm for
solving dense nonsymmetric eigenvalue problems on
hybrid distributed high performance computing (HPC)
systems is presented. For this purpose, we introduce
the concept of multi-window bulge chain chasing and
parallelize aggressive early deflation. The
multi-window approach ensures that most computations
when chasing chains of bulges are performed in level 3
BLAS operations, while the aim of aggressive early
deflation is to speed up the convergence of the QR
algorithm. Mixed MPI-OpenMP coding techniques are
utilized for porting the codes to distributed memory
platforms with multithreaded nodes, such as multicore
processors. Numerous numerical experiments confirm the
superior performance of our parallel QR algorithm in
comparison with the existing ScaLAPACK code, leading to
an implementation that is one to two orders of
magnitude faster for sufficiently large problems,
including a number of examples from applications.",
acknowledgement = ack-nhfb,
keywords = "aggressive early deflation; bulge chasing; Eigenvalue
problem; hybrid distributed memory systems; level 3
performance; multishift; nonsymmetric QR algorithm;
parallel algorithms; parallel computations",
utknumber = "UMINF-09.06",
author = "Ryan E. Grant and Ahmad Afsahi",
title = "Improving energy efficiency of asymmetric chip
multithreaded multiprocessors through reduced {OS}
noise scheduling",
journal = j-CCPE,
volume = "21",
number = "18",
pages = "2355--2376",
day = "25",
month = dec,
year = "2009",
DOI = "https://doi.org/10.1002/cpe.1454",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Mon Dec 5 10:08:40 MST 2011",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "8 Jul 2009",
author = "Zvika Guz and Evgeny Bolotin and Idit Keidar and
Avinoam Kolodny and Avi Mendelson and Uri C. Weiser",
title = "Many-Core vs. Many-Thread Machines: Stay Away From the
volume = "8",
number = "1",
pages = "25--28",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "We study the tradeoffs between Many-Core machines like
Intel's Larrabee and Many-Thread machines like Nvidia
and AMD GPGPUs. We define a unified model describing a
superposition of the two architectures, and use it to
identify operation zones for which each machine is more
suitable. Moreover, we identify an intermediate zone in
which both machines deliver inferior performance. We
study the shape of this ``performance valley'' and
provide insights on how it can be avoided.",
acknowledgement = ack-nhfb,
affiliation = "Guz, Z (Reprint Author), Technion Israel Inst Technol,
EE Dept, IL-32000 Haifa, Israel. Guz, Zvika; Keidar,
Idit; Kolodny, Avinoam; Weiser, Uri C., Technion Israel
Inst Technol, EE Dept, IL-32000 Haifa, Israel. Bolotin,
Evgeny, Intel Corp, Santa Clara, CA 95051 USA.
Mendelson, Avi, Microsoft Corp, Redmond, WA 98052
author-email = "zguz@tx.technion.ac.il evgeny.bolotin@intel.com
idish@ee.technion.ac.il kolodny@ee.technion.ac.il
avim@microsoft.com uri.weiser@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "V17GC",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Semiconductors Research Corporation (SRC);
Intel; Israeli Ministry of Science Knowledge Center on
Chip MultiProcessors",
funding-text = "We thank Ronny Ronen, Michael Behar, and Roni Rosner.
This work was partially supported by Semiconductors
Research Corporation (SRC), Intel, and the Israeli
Ministry of Science Knowledge Center on Chip
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "AMD GPGPU; architecture superposition; Bandwidth; Chip
Multiprocessors; Computer Systems; coprocessors; Delay;
Engines; Equations; GPGPU; Graphics; Intel's
Larrabee; many-core machines; many-thread machines;
Multi-core/single-chip multiprocessors;
multi-threading; multiprocessing systems; Nvidia GPGPU;
Parallel Architectures; parallel architectures;
Parallel processing; performance valley; Processor
Architectures; Shape",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "27",
unique-id = "Guz:2009:MCV",
web-of-science-categories = "Computer Science, Hardware \&
author = "Kevin J. Hoffman and Patrick Eugster and Suresh
title = "Semantics-aware trace analysis",
journal = j-SIGPLAN,
volume = "44",
number = "6",
pages = "453--464",
month = jun,
year = "2009",
DOI = "https://doi.org/10.1145/1542476.1542527",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Jun 16 14:41:16 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "As computer systems continue to become more powerful
and complex, so do programs. High-level abstractions
introduced to deal with complexity in large programs,
while simplifying human reasoning, can often obfuscate
salient program properties gleaned from automated
source-level analysis through subtle (often non-local)
interactions. Consequently, understanding the effects
of program changes and whether these changes violate
intended protocols become difficult to infer.
Refactorings, and feature additions, modifications, or
removals can introduce hard-to-catch bugs that often go
undetected until many revisions later.\par
To address these issues, this paper presents a novel
dynamic program analysis that builds a {\em semantic
view\/} of program executions. These views reflect
program abstractions and aspects; however, views are
not simply projections of execution traces, but are
linked to each other to capture semantic interactions
among abstractions at different levels of granularity
in a scalable manner.\par
We describe our approach in the context of Java and
demonstrate its utility to improve {\em regression
analysis}. We first formalize a subset of Java and a
grammar for traces generated at program execution. We
then introduce several types of views used to analyze
regression bugs along with a novel, scalable technique
for semantic differencing of traces from different
versions of the same program. Benchmark results on
large open-source Java programs demonstrate that
semantic-aware trace differencing can identify precise
and useful details about the underlying cause for a
regression, even in programs that use reflection,
multithreading, or dynamic code generation, features
that typically confound other analysis techniques.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "automated debugging; debugging aids; semantic tracing;
testing tools; trace views; tracing",
author = "Pallavi Joshi and Chang-Seo Park and Koushik Sen and
Mayur Naik",
title = "A randomized dynamic program analysis technique for
detecting real deadlocks",
journal = j-SIGPLAN,
volume = "44",
number = "6",
pages = "110--120",
month = jun,
year = "2009",
DOI = "https://doi.org/10.1145/1543135.1542489",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Jun 16 14:41:16 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We present a novel dynamic analysis technique that
finds real deadlocks in multi-threaded programs. Our
technique runs in two stages. In the first stage, we
use an imprecise dynamic analysis technique to find
potential deadlocks in a multi-threaded program by
observing an execution of the program. In the second
stage, we control a random thread scheduler to create
the potential deadlocks with high probability. Unlike
other dynamic analysis techniques, our approach has the
advantage that it does not give any false warnings. We
have implemented the technique in a prototype tool for
Java, and have experimented on a number of large
multi-threaded Java programs. We report a number of
previously known and unknown real deadlocks that were
found in these benchmarks.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "active testing; concurrency; deadlock detection;
dynamic program analysis",
author = "Arun Kejariwal and Alexander V. Veidenbaum and
Alexandru Nicolau and Milind Girkar and Xinmin Tian and
Hideki Saito",
title = "On the exploitation of loop-level parallelism in
embedded applications",
journal = j-TECS,
volume = "8",
number = "2",
pages = "10:1--10:??",
month = jan,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1457255.1457257",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Feb 5 19:15:05 MST 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Advances in the silicon technology have enabled
increasing support for hardware parallelism in embedded
processors. Vector units, multiple processors/cores,
multithreading, special-purpose accelerators such as
DSPs or cryptographic engines, or a combination of the
above have appeared in a number of processors. They
serve to address the increasing performance
requirements of modern embedded applications. To what
extent the available hardware parallelism can be
exploited is directly dependent on the amount of
parallelism inherent in the given application and the
congruence between the granularity of hardware and
application parallelism. This paper discusses how
loop-level parallelism in embedded applications can be
exploited in hardware and software. Specifically, it
evaluates the efficacy of automatic loop
parallelization and the performance potential of
different types of parallelism, viz., true thread-level
parallelism (TLP), speculative thread-level parallelism
and vector parallelism, when executing loops.
Additionally, it discusses the interaction between
parallelization and vectorization. Applications from
both the industry-standard EEMBC{\reg},$^1$ 1.1, EEMBC
2.0 and the academic MiBench embedded benchmark suites
are analyzed using the Intel{\reg}$^2$ C compiler. The
results show the performance that can be achieved today
on real hardware and using a production compiler,
provide upper bounds on the performance potential of
the different types of thread-level parallelism, and
point out a number of issues that need to be addressed
to improve performance. The latter include
parallelization of libraries such as libc and design of
parallel algorithms to allow maximal exploitation of
parallelism. The results also point to the need for
developing new benchmark suites more suitable to
parallel compilation and execution.\par
$^1$ Other names and brands may be claimed as the
property of others.\par
$^2$ Intel is a trademark of Intel Corporation or its
subsidiaries in the United States and other
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840",
keywords = "libraries; Multi-cores; multithreading; parallel
loops; programming models; system-on-chip (Soc);
thread-level speculation; vectorization",
author = "Arun Kejariwal and Calin Cas{\c{c}}aval",
title = "Parallelization spectroscopy: analysis of thread-level
parallelism in {HPC} programs",
journal = j-SIGPLAN,
volume = "44",
number = "4",
pages = "293--294",
month = apr,
year = "2009",
DOI = "https://doi.org/10.1145/1594835.1504221",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Fri Oct 9 08:40:49 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In this paper, we present a method --- parallelization
spectroscopy --- for analyzing the thread-level
parallelism available in production High Performance
Computing (HPC) codes. We survey a number of techniques
that are commonly used for parallelization and classify
all the loops in the case study presented using a
sensitivity metric: how likely it is that a particular
technique is successful in parallelizing the loop.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "loop transformations; multithreading; parallelism",
author = "K. Kunal and K. George and M. Gautam and V. Kamakoti",
title = "{HTM} design spaces: complete decoupling from caches
and achieving highly concurrent transactions",
journal = j-OPER-SYS-REV,
volume = "43",
number = "2",
pages = "98--99",
month = apr,
year = "2009",
DOI = "https://doi.org/10.1145/1531793.1531809",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Thu Apr 23 19:43:22 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper proposes a Hardware Transactional Memory
(HTM) design for multi-core environments. Using a novel
technique to keep track of transactional read-write
entries, the design provides a holistic and scalable
solution to Transactional Memory (TM) implementation
issues of context switching, process migration and
overflow handling. Another aspect of the design is that
it allows transactions to run in a highly concurrent
manner by using special techniques to handle conflict
resolution, conflict detection and overflows. The
feasibility and validity of the proposed design are
demonstrated by developing a synthesizable Hardware
Description Language (HDL) model of the design and also
experimenting on the same with standard benchmarks.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
keywords = "context switching; hardware transactional memory;
multi-threaded cores; operating systems; overflow
handling; process migration",
author = "Jakub Kurzak and Hatem Ltaief and Jack Dongarra and
Rosa M. Badia",
title = "Scheduling Linear Algebra Operations on Multicore
type = "LAPACK Working Note",
number = "213",
institution = inst-UT-CS,
address = inst-UT-CS:adr,
month = feb,
year = "2009",
bibdate = "Fri Apr 24 12:25:43 2009",
bibsource = "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
URL = "http://www.netlib.org/lapack/lawnspdf/lawn213.pdf",
abstract = "We present performance results for dense linear
algebra using the 8-series NVIDIA GPUs. Our
matrix-matrix multiply routine (GEMM) runs 60\% faster
than the vendor implementation in CUBLAS 1.1 and
approaches the peak of hardware capabilities. Our LU,
QR and Cholesky factorizations achieve up to 80--90\%
of the peak GEMM rate. Our parallel LU running on two
GPUs achieves up to $ \approx $300 Gflop/s. These
results are accomplished by challenging the accepted
view of the GPU architecture and
programming guidelines. We argue that modern GPUs
should be viewed as multithreaded multicore vector
units. We exploit blocking similarly to vector
computers and heterogeneity of the system by computing
both on GPU and CPU. This study includes detailed
benchmarking of the GPU memory system that reveals
sizes and latencies of caches and TLB. We present a
couple of algorithmic optimizations aimed at increasing
parallelism and regularity in the problem that provide
us with slightly higher performance.",
acknowledgement = ack-nhfb,
keywords = "Cholesky; factorization; linear algebra; LU;
multicore; QR; scheduling; task graph",
utknumber = "UT-CS-09-636",
author = "Taehee Lee and Tobias H{\"o}llerer",
title = "Multithreaded Hybrid Feature Tracking for Markerless
Augmented Reality",
volume = "15",
number = "3",
pages = "355--368",
month = may # "\slash " # jun,
year = "2009",
DOI = "https://doi.org/10.1109/TVCG.2008.190",
ISSN = "1077-2626 (print), 1941-0506 (electronic), 2160-9306",
ISSN-L = "1077-2626",
bibdate = "Thu Jul 2 10:22:33 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetransviscomputgraph.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Visualization and Computer
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2945",
author = "Andrew Lenharth and Vikram S. Adve and Samuel T.
title = "Recovery domains: an organizing principle for
recoverable operating systems",
journal = j-SIGPLAN,
volume = "44",
number = "3",
pages = "49--60",
month = mar,
year = "2009",
DOI = "https://doi.org/10.1145/1508284.1508251",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Jun 16 14:39:26 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We describe a strategy for enabling existing commodity
operating systems to recover from unexpected run-time
errors in nearly any part of the kernel, including core
kernel components. Our approach is dynamic and
request-oriented; it isolates the effects of a fault to
the requests that caused the fault rather than to
static kernel components. This approach is based on a
notion of ``recovery domains,'' an organizing principle
to enable rollback of state affected by a request in a
multithreaded system with minimal impact on other
requests or threads. We have applied this approach on
v2.4.22 and v2.6.27 of the Linux kernel and it required
132 lines of changed or new code: the other changes are
all performed by a simple instrumentation pass of a
compiler. Our experiments show that the approach is
able to recover from otherwise fatal faults with
minimal collateral impact during a recovery event.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "akeso; automatic fault recovery; recovery domains",
author = "Enno L{\"u}bbers and Marco Platzner",
title = "{ReconOS}: {Multithreaded} programming for
reconfigurable computers",
journal = j-TECS,
volume = "9",
number = "1",
pages = "8:1--8:??",
month = oct,
year = "2009",
CODEN = "????",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Mon Mar 15 18:40:57 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840",
author = "Carlos Madriles and Pedro L{\'o}pez and Josep M.
Codina and Enric Gibert and Fernando Latorre and
Alejandro Martinez and Ra{\'u}l Martinez and Antonio
title = "Boosting single-thread performance in multi-core
systems through fine-grain multi-threading",
journal = j-COMP-ARCH-NEWS,
volume = "37",
number = "3",
pages = "474--483",
month = jun,
year = "2009",
DOI = "https://doi.org/10.1145/1555754.1555813",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Aug 11 18:12:55 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Industry has shifted towards multi-core designs as we
have hit the memory and power walls. However, single
thread performance remains of paramount importance
since some applications have limited thread-level
parallelism (TLP), and even a small part with limited
TLP imposes important constraints to the global
performance, as explained by Amdahl's law.\par
In this paper we propose a novel approach for
leveraging multiple cores to improve single-thread
performance in a multi-core design. The proposed
technique features a set of novel hardware mechanisms
that support the execution of threads generated at
compile time. These threads result from a fine-grain
speculative decomposition of the original application
and they are executed under a modified multi-core
system that includes: (1) mechanisms to support
multiple versions; (2) mechanisms to detect violations
among threads; (3) mechanisms to reconstruct the
original sequential order; and (4) mechanisms to
checkpoint the architectural state and recovery to
handle misspeculations.\par
The proposed scheme outperforms previous hardware-only
schemes to implement the idea of combining cores for
executing single-thread applications in a multi-core
design by more than 10\% on average on Spec2006 for all
configurations. Moreover, single-thread performance is
improved by 41\% on average when the proposed scheme is
used on a Tiny Core, and up to 2.6x for some selected
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "automatic parallelization; core-fusion; multicore;
single-thread performance; speculative multithreading;
thread-level parallelism",
author = "Daniel Marino and Madanlal Musuvathi and Satish
title = "{LiteRace}: effective sampling for lightweight
data-race detection",
journal = j-SIGPLAN,
volume = "44",
number = "6",
pages = "134--143",
month = jun,
year = "2009",
DOI = "https://doi.org/10.1145/1542476.1542491",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Jun 16 14:41:16 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Data races are one of the most common and subtle
causes of pernicious concurrency bugs. Static
techniques for preventing data races are overly
conservative and do not scale well to large programs.
Past research has produced several dynamic data race
detectors that can be applied to large programs. They
are precise in the sense that they only report actual
data races. However, dynamic data race detectors incur
a high performance overhead, slowing down a program's
execution by an order of magnitude.\par
In this paper we present LiteRace, a very lightweight
data race detector that samples and analyzes only
selected portions of a program's execution. We show
that it is possible to sample a multithreaded program
at a low frequency, and yet, find infrequently
occurring data races. We implemented LiteRace using
Microsoft's Phoenix compiler. Our experiments with
several Microsoft programs, Apache, and Firefox show
that LiteRace is able to find more than 70\% of data
races by sampling less than 2\% of memory accesses in a
given program execution.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrency bugs; dynamic data race detection;
author = "Matteo Monchiero and Jung Ho Ahn and Ayose Falc{\'o}n
and Daniel Ortega and Paolo Faraboschi",
title = "How to simulate 1000 cores",
journal = j-COMP-ARCH-NEWS,
volume = "37",
number = "2",
pages = "10--19",
month = may,
year = "2009",
DOI = "https://doi.org/10.1145/1577129.1577133",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Aug 11 18:12:39 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper proposes a novel methodology to efficiently
simulate shared-memory multiprocessors composed of
hundreds of cores. The basic idea is to use
thread-level parallelism in the software system and
translate it into core-level parallelism in the
simulated world. To achieve this, we first augment an
existing full-system simulator to identify and separate
the instruction streams belonging to the different
software threads. Then, the simulator dynamically maps
each instruction flow to the corresponding core of the
target multi-core architecture, taking into account the
inherent thread synchronization of the running
applications. Our simulator allows a user to execute
any multithreaded application in a conventional
full-system simulator and evaluate the performance of
the application on a many-core hardware. We carried out
extensive simulations on the SPLASH-2 benchmark suite
and demonstrated the scalability up to 1024 cores with
limited simulation speed degradation vs. the
single-core case on a fixed workload. The results also
show that the proposed technique captures the intrinsic
behavior of the SPLASH-2 suite, even when we scale up
the number of shared-memory cores beyond the
thousand-core limit.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Jayanta Mukherjee and Soumyendu Raha",
title = "Power-aware Speed-up for Multithreaded Numerical
Linear Algebraic Solvers on Chip Multicore Processors",
journal = j-SCPE,
volume = "10",
number = "2",
pages = "217--228",
month = jun,
year = "2009",
CODEN = "????",
ISSN = "1895-1767",
bibdate = "Thu Sep 2 11:55:11 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.scpe.org/vols/vol10/no2/SCPE_10_2_07.pdf;
acknowledgement = ack-nhfb,
author = "Enric Musoll",
title = "Leakage-saving opportunities in mesh-based massive
multi-core architectures",
journal = j-COMP-ARCH-NEWS,
volume = "37",
number = "5",
pages = "1--7",
month = dec,
year = "2009",
DOI = "https://doi.org/10.1145/1755235.1755237",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Thu Apr 8 18:42:25 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "When processing multi-threaded workloads requiring
significant inter-thread communication, opportunities
to reduce power consumption arise due to the large
latencies in obtaining data from the threads running on
remote cores and the lack of architectural resources
implemented in the simple cores to cover these
In this work we propose to use the drowsy mode
technique to save leakage power on the cores and
leverage the mesh-based communication fabric to hide
the wake-up latency of the core blocks. We have
observed a potential for reducing the overall power of
around 70\% in a generic homogeneous 256-core
tile-based multi-core architecture.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Iulian Neamtiu and Michael Hicks",
title = "Safe and timely updates to multi-threaded programs",
journal = j-SIGPLAN,
volume = "44",
number = "6",
pages = "13--24",
month = jun,
year = "2009",
DOI = "https://doi.org/10.1145/1543135.1542479",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Jun 16 14:41:16 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Many dynamic updating systems have been developed that
enable a program to be patched while it runs, to fix
bugs or add new features. This paper explores
techniques for supporting dynamic updates to
multi-threaded programs, focusing on the problem of
applying an update in a timely fashion while still
producing correct behavior. Past work has shown that
this tension of {\em safety\/} versus timeliness can be
balanced for single-threaded programs. For
multi-threaded programs, the task is more difficult
because myriad thread interactions complicate
understanding the possible program states to which a
patch could be applied. Our approach allows the
programmer to specify a few program points (e.g., one
per thread) at which a patch may be applied, which
simplifies reasoning about safety. To improve
timeliness, a combination of static analysis and
run-time support automatically expands these few points
to many more that produce behavior equivalent to the
originals. Experiments with thirteen realistic updates
to three multi-threaded servers show that we can safely
perform a dynamic update within milliseconds when more
straightforward alternatives would delay some updates
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "dynamic software updating; multi-threading; update
safety; update timeliness",
author = "Alexandru Nicolau and Guangqiang Li and Arun
title = "Techniques for efficient placement of synchronization
journal = j-SIGPLAN,
volume = "44",
number = "4",
pages = "199--208",
month = apr,
year = "2009",
DOI = "https://doi.org/10.1145/1504176.1504207",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Fri Oct 9 08:40:49 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Harnessing the hardware parallelism of the emerging
multi-core systems necessitates concurrent software.
Unfortunately, most of the existing mainstream software
is sequential in nature. Although one could
auto-parallelize a given program, the efficacy of this
is largely limited to floating-point codes. One of the
ways to alleviate the above limitation is to
parallelize programs, which cannot be
auto-parallelized, via explicit synchronization. In
this regard, efficient placement of the synchronization
primitives --- say, post, wait --- plays a key role in
achieving high degree of thread-level parallelism ({\em
TLP\/}). In this paper, we propose novel compiler
techniques for the above. Specifically, given a control
flow graph ({\em CFG\/}), the proposed techniques place
a post as early as possible and place a wait as late as
possible in the CFG, subject to dependences. We
demonstrate the efficacy of our techniques, on a real
machine, using real codes, specifically, from the
industry-standard SPEC CPU benchmarks, the Linux kernel
and other widely used open source codes. Our results
show that the proposed techniques yield significantly
higher levels of TLP than the state-of-the-art.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "compilers; multithreading; parallelization;
author = "Marek Olszewski and Jason Ansel and Saman
title = "{Kendo}: efficient deterministic multithreading in
journal = j-SIGPLAN,
volume = "44",
number = "3",
pages = "97--108",
month = mar,
year = "2009",
DOI = "https://doi.org/10.1145/1508244.1508256",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Jun 16 14:39:26 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Although chip-multiprocessors have become the industry
standard, developing parallel applications that target
them remains a daunting task. Non-determinism, inherent
in threaded applications, causes significant challenges
for parallel programmers by hindering their ability to
create parallel applications with repeatable results.
As a consequence, parallel applications are
significantly harder to debug, test, and maintain than
sequential programs.\par
This paper introduces Kendo: a new software-only system
that provides deterministic multithreading of parallel
applications. Kendo enforces a deterministic
interleaving of lock acquisitions and specially
declared non-protected reads through a novel
dynamically load-balanced deterministic scheduling
algorithm. The algorithm tracks the progress of each
thread using performance counters to construct a
deterministic logical time that is used to compute an
interleaving of shared data accesses that is both
deterministic and provides good load balancing. Kendo
can run on today's commodity hardware while incurring
only a modest performance cost. Experimental results on
the SPLASH-2 applications yield a geometric mean
overhead of only 16\% when running on 4 processors.
This low overhead makes it possible to benefit from
Kendo even after an application is deployed.
Programmers can start using Kendo today to program
parallel applications that are easier to develop,
debug, and test.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "debugging; determinism; deterministic multithreading;
multicore; parallel programming",
author = "J. C. Pichel and D. B. Heras and J. C. Cabaleiro and
F. F. Rivera",
title = "Increasing data reuse of sparse algebra codes on
simultaneous multithreading architectures",
journal = j-CCPE,
volume = "21",
number = "15",
pages = "1838--1856",
month = oct,
year = "2009",
DOI = "https://doi.org/10.1002/cpe.1404",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Mon Dec 5 10:08:38 MST 2011",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "11 Feb 2009",
author = "Harald Piringer and Christian Tominski and Philipp
Muigg and Wolfgang Berger",
title = "A Multi-Threading Architecture to Support Interactive
Visual Exploration",
volume = "15",
number = "6",
pages = "1113--1120",
month = nov # "\slash " # dec,
year = "2009",
DOI = "https://doi.org/10.1109/TVCG.2009.110",
ISSN = "1077-2626 (print), 1941-0506 (electronic), 2160-9306",
ISSN-L = "1077-2626",
bibdate = "Thu May 13 17:38:49 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetransviscomputgraph.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Visualization and Computer
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2945",
author = "Gregorio Quintana-Ort{\'\i} and Enrique S.
Quintana-Ort{\'\i} and Robert A. {Van De Geijn} and
Field G. {Van Zee} and Ernie Chan",
title = "Programming matrix algorithms-by-blocks for
thread-level parallelism",
journal = j-TOMS,
volume = "36",
number = "3",
pages = "14:1--14:26",
month = jul,
year = "2009",
DOI = "https://doi.org/10.1145/1527286.1527288",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Tue Jul 21 14:09:07 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "With the emergence of thread-level parallelism as the
primary means for continued performance improvement,
the programmability issue has reemerged as an obstacle
to the use of architectural advances. We argue that
evolving legacy libraries for dense and banded linear
algebra is not a viable solution due to constraints
imposed by early design decisions. We propose a
philosophy of abstraction and separation of concerns
that provides a promising solution in this problem
domain. The first abstraction, FLASH, allows algorithms
to express computation with matrices consisting of
contiguous blocks, facilitating algorithms-by-blocks.
Operand descriptions are registered for a particular
operation a priori by the library implementor. A
runtime system, SuperMatrix, uses this information to
identify data dependencies between suboperations,
allowing them to be scheduled to threads out-of-order
and executed in parallel. But not all classical
algorithms in linear algebra lend themselves to
conversion to algorithms-by-blocks. We show how our
recently proposed LU factorization with incremental
pivoting and a closely related algorithm-by-blocks for
the QR factorization, both originally designed for
out-of-core computation, overcome this difficulty.
Anecdotal evidence regarding the development of
routines with a core functionality demonstrates how the
methodology supports high productivity while
experimental results suggest that high performance is
abundantly achievable.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
keywords = "high-performance; libraries; Linear algebra;
multithreaded architectures",
author = "P. Raghavan and A. Lambrechts and M. Jayapala and F.
Catthoor and D. Verkest",
title = "Distributed Loop Controller for Multithreading in
Unithreaded {ILP} Architectures",
journal = j-IEEE-TRANS-COMPUT,
volume = "58",
number = "3",
pages = "311--321",
month = mar,
year = "2009",
DOI = "https://doi.org/10.1109/TC.2008.168",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Mon Jul 4 11:37:40 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4624249",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Paruj Ratanaworabhan and Martin Burtscher and Darko
Kirovski and Benjamin Zorn and Rahul Nagpal and Karthik
title = "Detecting and tolerating asymmetric races",
journal = j-SIGPLAN,
volume = "44",
number = "4",
pages = "173--184",
month = apr,
year = "2009",
DOI = "https://doi.org/10.1145/1504176.1504202",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Fri Oct 9 08:40:49 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper introduces ToleRace, a runtime system that
allows programs to detect and even tolerate asymmetric
data races. Asymmetric races are race conditions where
one thread correctly acquires and releases a lock for a
shared variable while another thread improperly
accesses the same variable. ToleRace provides
approximate isolation in the critical sections of
lock-based parallel programs by creating a local copy
of each shared variable when entering a critical
section, operating on the local copies, and propagating
the appropriate copies upon leaving the critical
section. We start by characterizing all possible
interleavings that can cause races and precisely
describe the effect of ToleRace in each case. Then, we
study the theoretical aspects of an oracle that knows
exactly what type of interleaving has occurred.
Finally, we present two software implementations of
ToleRace and evaluate them on multithreaded
applications from the SPLASH2 and PARSEC suites. Our
implementation on top of a dynamic instrumentation
tool, which works directly on executables and requires
no source code modifications, incurs an overhead of a
factor of two on average. Manually adding ToleRace to
the source code of these applications results in an
average overhead of 6.4 percent.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "dynamic instrumentation; race detection and
toleration; runtime support",
author = "Elvinia Riccobene and Patrizia Scandurra and Sara
Bocchio and Alberto Rosti and Luigi Lavazza and Luigi
title = "{SystemC\slash C-based} model-driven design for
embedded systems",
journal = j-TECS,
volume = "8",
number = "4",
pages = "30:1--30:??",
month = jul,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1550987.1550993",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Jul 23 12:32:49 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This article summarizes our effort, since 2004 up to
the present time, for improving the current industrial
Systems-on-Chip and Embedded Systems design by joining
the capabilities of the unified modeling language (UML)
and SystemC/C programming languages to operate at
system-level. The proposed approach exploits the OMG
model-driven architecture --- a framework for
Model-driven Engineering --- capabilities of reducing
abstract, coarse-grained and platform-independent
system models to fine-grained and platform-specific
models. We first defined a design methodology and a
development flow for the hardware, based on a SystemC
UML profile and encompassing different levels of
abstraction. We then included a multithread C UML
profile for modelling software applications. Both
SystemC/C profiles are consistent sets of modelling
constructs designed to lift the programming features
(both structural and behavioral) of the two coding
languages to the UML modeling level. The new codesign
flow is supported by an environment, which allows
system modeling at higher abstraction levels (from a
functional executable level to a register transfer
level) and supports automatic
code-generation/back-annotation from/to UML models.",
acknowledgement = ack-nhfb,
articleno = "30",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840",
keywords = "C; ES; MDE; SoC; SystemC; UML",
author = "Indrajit Roy and Donald E. Porter and Michael D. Bond
and Kathryn S. McKinley and Emmett Witchel",
title = "{Laminar}: practical fine-grained decentralized
information flow control",
journal = j-SIGPLAN,
volume = "44",
number = "6",
pages = "63--74",
month = jun,
year = "2009",
DOI = "https://doi.org/10.1145/1543135.1542484",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Jun 16 14:41:16 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Decentralized information flow control (DIFC) is a
promising model for writing programs with powerful,
end-to-end security guarantees. Current DIFC systems
that run on commodity hardware can be broadly
categorized into two types: language-level and
operating system-level DIFC. Language level solutions
provide no guarantees against security violations on
system resources, like files and sockets. Operating
system solutions can mediate accesses to system
resources, but are inefficient at monitoring the flow
of information through fine-grained program data
structures.\par
This paper describes Laminar, the first system to
implement decentralized information flow control using
a single set of abstractions for OS resources and
heap-allocated objects. Programmers express security
policies by labeling data with secrecy and integrity
labels, and then access the labeled data in lexically
scoped security regions. Laminar enforces the security
policies specified by the labels at runtime. Laminar is
implemented using a modified Java virtual machine and a
new Linux security module. This paper shows that
security regions ease incremental deployment and limit
dynamic security checks, allowing us to retrofit DIFC
policies on four application case studies. Replacing
the applications' ad-hoc security policies changes less
than 10\% of the code, and incurs performance overheads
from 1\% to 56\%. Whereas prior DIFC systems only
support limited types of multithreaded programs,
Laminar supports a more general class of multithreaded
DIFC programs that can access heterogeneously labeled
data.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "information flow control; java virtual machine;
operating systems; security region",
author = "Stelios Sidiroglou and Oren Laadan and Carlos Perez
and Nicolas Viennot and Jason Nieh and Angelos D.
title = "{ASSURE}: automatic software self-healing using rescue
journal = j-SIGPLAN,
volume = "44",
number = "3",
pages = "37--48",
month = mar,
year = "2009",
DOI = "https://doi.org/10.1145/1508284.1508250",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Jun 16 14:39:26 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Software failures in server applications are a
significant problem for preserving system availability.
We present ASSURE, a system that introduces rescue
points that recover software from unknown faults while
maintaining both system integrity and availability, by
mimicking system behavior under known error conditions.
Rescue points are locations in existing application
code for handling a given set of programmer-anticipated
failures, which are automatically repurposed and tested
for safely enabling fault recovery from a larger class
of (unanticipated) faults. When a fault occurs at an
arbitrary location in the program, ASSURE restores
execution to an appropriate rescue point and induces
the program to recover execution by virtualizing the
program's existing error-handling facilities. Rescue
points are identified using fuzzing, implemented using
a fast coordinated checkpoint-restart mechanism that
handles multi-process and multi-threaded applications,
and, after testing, are injected into production code
using binary patching. We have implemented an ASSURE
Linux prototype that operates without application
source code and without base operating system kernel
changes. Our experimental results on a set of
real-world server applications and bugs show that
ASSURE enabled recovery for all of the bugs tested with
fast recovery times, has modest performance overhead,
and provides automatic self-healing orders of magnitude
faster than current human-driven patch deployment
methods.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "binary patching; checkpoint restart; error recovery;
reliable software; software self-healing",
author = "Seung Woo Son and Mahmut Kandemir and Mustafa Karakoy
and Dhruva Chakrabarti",
title = "A compiler-directed data prefetching scheme for chip
journal = j-SIGPLAN,
volume = "44",
number = "4",
pages = "209--218",
month = apr,
year = "2009",
DOI = "https://doi.org/10.1145/1504176.1504208",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Fri Oct 9 08:40:49 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Data prefetching has been widely used in the past as a
technique for hiding memory access latencies. However,
data prefetching in multi-threaded applications running
on chip multiprocessors (CMPs) can be problematic when
multiple cores compete for a shared on-chip cache (L2
or L3). In this paper, we (i) quantify the impact of
conventional data prefetching on shared caches in CMPs.
The experimental data collected using multi-threaded
applications indicates that, while data prefetching
improves performance in small number of cores, its
benefits reduce significantly as the number of cores is
increased, that is, it is not scalable; (ii) identify
harmful prefetches as one of the main contributors for
degraded performance with a large number of cores; and
(iii) propose and evaluate a compiler-directed data
prefetching scheme for shared on-chip cache based CMPs.
The proposed scheme first identifies program phases
using static compiler analysis, and then divides the
threads into groups within each phase and assigns a
customized prefetcher thread (helper thread) to each
group of threads. This helps to reduce the total number
of prefetches issued, prefetch overheads, and negative
interactions on the shared cache space due to data
prefetches, and more importantly, makes
compiler-directed prefetching a scalable optimization
for CMPs. Our experiments with the applications from
the SPEC OMP benchmark suite indicate that the proposed
scheme improves overall parallel execution latency by
18.3\% over the no-prefetch case and 6.4\% over the
conventional data prefetching scheme (where each core
prefetches its data independently), on average, when 12
cores are used. The corresponding average performance
improvements with 24 cores are 16.4\% (over the
no-prefetch case) and 11.7\% (over the conventional
prefetching case). We also demonstrate that the
proposed scheme is robust under a wide range of values
of our major simulation parameters, and the
improvements it achieves come very close to those that
can be achieved using an optimal scheme.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "chip multiprocessors; compiler; helper thread;
author = "M. Aater Suleman and Onur Mutlu and Moinuddin K.
Qureshi and Yale N. Patt",
title = "Accelerating critical section execution with
asymmetric multi-core architectures",
journal = j-SIGPLAN,
volume = "44",
number = "3",
pages = "253--264",
month = mar,
year = "2009",
DOI = "https://doi.org/10.1145/1508244.1508274",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Jun 16 14:39:26 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "To improve the performance of a single application on
Chip Multiprocessors (CMPs), the application must be
split into threads which execute concurrently on
multiple cores. In multi-threaded applications,
critical sections are used to ensure that only one
thread accesses shared data at any given time. Critical
sections can serialize the execution of threads, which
significantly reduces performance and
scalability.\par
This paper proposes Accelerated Critical Sections
(ACS), a technique that leverages the high-performance
core(s) of an Asymmetric Chip Multiprocessor (ACMP) to
accelerate the execution of critical sections. In ACS,
selected critical sections are executed by a
high-performance core, which can execute the critical
section faster than the other, smaller cores. As a
result, ACS reduces serialization: it lowers the
likelihood of threads waiting for a critical section to
finish. Our evaluation on a set of 12
critical-section-intensive workloads shows that ACS
reduces the average execution time by 34\% compared to
an equal-area 32T-core symmetric CMP and by 23\%
compared to an equal-area ACMP. Moreover, for 7 out of
the 12 workloads, ACS improves scalability by
increasing the number of threads at which performance
saturates.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "cmp; critical sections; heterogeneous cores; locks;
multi-core; parallel programming",
author = "G{\'e}rard Swinnen",
title = "Apprendre {\`a} programmer avec Python: objet,
multithreading, {\'e}v{\'e}nements, bases de
donn{\'e}es, programmation web, programmation
r{\'e}seau, Unicode",
publisher = pub-EYROLLES,
address = pub-EYROLLES:adr,
pages = "xviii + 341",
year = "2009",
LCCN = "????",
bibdate = "Thu Apr 16 12:00:29 MDT 2009",
bibsource = "carmin.sudoc.abes.fr:210/ABES-Z39-PUBLIC;
acknowledgement = ack-nhfb,
language = "French",
author = "Nathan R. Tallent and John M. Mellor-Crummey",
title = "Effective performance measurement and analysis of
multithreaded applications",
journal = j-SIGPLAN,
volume = "44",
number = "4",
pages = "229--240",
month = apr,
year = "2009",
DOI = "https://doi.org/10.1145/1504176.1504210",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Fri Oct 9 08:40:49 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Understanding why the performance of a multithreaded
program does not improve linearly with the number of
cores in a shared-memory node populated with one or
more multicore processors is a problem of growing
practical importance. This paper makes three
contributions to performance analysis of multithreaded
programs. First, we describe how to measure and
attribute {\em parallel idleness}, namely, where
threads are stalled and unable to work. This technique
applies broadly to programming models ranging from
explicit threading ({\em e.g.}, Pthreads) to
higher-level models such as Cilk and OpenMP. Second, we
describe how to measure and attribute {\em parallel
overhead\/} -- when a thread is performing
miscellaneous work other than executing the user's
computation. By employing a combination of compiler
support and post-mortem analysis, we incur no
measurement cost beyond normal profiling to glean this
information. Using {\em idleness\/} and {\em
overhead\/} metrics enables one to pinpoint areas of an
application where concurrency should be increased (to
reduce idleness), decreased (to reduce overhead), or
where the present parallelization is hopeless (where
idleness and overhead are both high). Third, we
describe how to measure and attribute arbitrary
performance metrics for high-level multithreaded
programming models, such as Cilk. This requires
bridging the gap between the expression of logical
concurrency in programs and its realization at run-time
as it is adaptively partitioned and scheduled onto a
pool of threads. We have prototyped these ideas in the
context of Rice University's HPCToolkit performance
tools. We describe our approach, implementation, and
experiences applying this approach to measure and
attribute work, idleness, and overhead in executions of
Cilk programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "call path profiling; hpctoolkit; multithreaded
programming models; performance analysis",
author = "Rajeev Thakur and William Gropp",
title = "Test suite for evaluating performance of multithreaded
{MPI} communication",
volume = "35",
number = "12",
pages = "608--617",
month = dec,
year = "2009",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Thu Sep 2 17:51:11 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
author = "Pascal Vander-Swalmen and Gilles Dequen and
Micha{\"e}l Krajecki",
title = "A Collaborative Approach for Multi-Threaded {SAT}
journal = j-INT-J-PARALLEL-PROG,
volume = "37",
number = "3",
pages = "324--342",
month = jun,
year = "2009",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Wed Sep 1 16:06:47 MDT 2010",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=37&issue=3;
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=37&issue=3&spage=324",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
author = "Xavier Vera and Jaume Abella and Javier Carretero and
Antonio Gonz{\'a}lez",
title = "Selective replication: a lightweight technique for
soft errors",
journal = j-TOCS,
volume = "27",
number = "4",
pages = "8:1--8:30",
month = dec,
year = "2009",
DOI = "https://doi.org/10.1145/1658357.1658359",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Mon Mar 15 09:06:46 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
abstract = "Soft errors are an important challenge in contemporary
microprocessors. Modern processors have caches and
large memory arrays protected by parity or error
detection and correction codes. However, today's
failure rate is dominated by flip flops, latches, and
the increasing sensitivity of combinational logic to
particle strikes. Moreover, as Chip Multi-Processors
(CMPs) become ubiquitous, meeting the FIT budget for
new designs is becoming a major
challenge.\par
Solutions based on replicating threads have been
explored deeply; however, their high cost in
performance and energy make them unsuitable for current
designs. Moreover, our studies based on a typical
configuration for a modern processor show that focusing
on the top 5 most vulnerable structures can provide up
to 70\% reduction in FIT rate. Therefore, full
replication may overprotect the chip by reducing the
FIT much below budget.\par
We propose {\em Selective Replication}, a
lightweight-reconfigurable mechanism that achieves a
high FIT reduction by protecting the most vulnerable
instructions with minimal performance and energy
impact. Low performance degradation is achieved by not
requiring additional issue slots and reissuing
instructions only during the time window between when
they are retirable and they actually retire. Coverage
can be reconfigured online by replicating only a subset
of the instructions (the most vulnerable ones).
Instructions' vulnerability is estimated based on the
area they occupy and the time they spend in the issue
queue. By changing the vulnerability threshold, we can
adjust the trade-off between coverage and performance
Results for an out-of-order processor configured
similarly to Intel{\reg} Core\TM{} Micro-Architecture
show that our scheme can achieve over 65\% FIT
reduction with less than 4\% performance degradation
with small area and complexity overhead.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
keywords = "AVF prediction; FIT reduction; redundant
multithreading; Soft errors",
author = "Yin Wang and St{\'e}phane Lafortune and Terence Kelly
and Manjunath Kudlur and Scott Mahlke",
title = "The theory of deadlock avoidance via discrete
journal = j-SIGPLAN,
volume = "44",
number = "1",
pages = "252--263",
month = jan,
year = "2009",
DOI = "https://doi.org/10.1145/1480881.1480913",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Fri Oct 9 08:40:38 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Deadlock in multithreaded programs is an increasingly
important problem as ubiquitous multicore architectures
force parallelization upon an ever wider range of
software. This paper presents a theoretical foundation
for dynamic deadlock avoidance in concurrent programs
that employ conventional mutual exclusion and
synchronization primitives (e.g., multithreaded
C/Pthreads programs). Beginning with control flow
graphs extracted from program source code, we construct
a formal model of the program and then apply Discrete
Control Theory to automatically synthesize
deadlock-avoidance control logic that is implemented by
program instrumentation. At run time, the control logic
avoids deadlocks by postponing lock acquisitions.
Discrete Control Theory guarantees that the program
instrumented with our synthesized control logic cannot
deadlock. Our method furthermore guarantees that the
control logic is maximally permissive: it postpones
lock acquisitions only when necessary to prevent
deadlocks, and therefore permits maximal runtime
concurrency. Our prototype for C/Pthreads scales to
real software including Apache, OpenLDAP, and two kinds
of benchmarks, automatically avoiding both injected and
naturally occurring deadlocks while imposing modest
runtime overheads.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrent programming; discrete control theory;
dynamic deadlock avoidance; multicore processors;
multithreaded programming; parallel programming",
author = "Lamia Youseff and Keith Seymour and Haihang You and
Dmitrii Zagorodnov and Jack Dongarra and Rich Wolski",
title = "Paravirtualization effect on single- and multi-threaded
memory-intensive linear algebra software",
journal = "The Journal of Networks, Software Tools, and Cluster
volume = "12",
number = "2",
pages = "101--122",
month = "????",
year = "2009",
DOI = "https://doi.org/10.1007/s10586-009-0080-4",
ISSN = "1386-7857",
bibdate = "Tue Jun 4 08:20:03 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
acknowledgement = ack-nhfb,
author = "Jie Yu and Satish Narayanasamy",
title = "A case for an interleaving constrained shared-memory
journal = j-COMP-ARCH-NEWS,
volume = "37",
number = "3",
pages = "325--336",
month = jun,
year = "2009",
DOI = "https://doi.org/10.1145/1555815.1555796",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Aug 11 18:12:55 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Shared-memory multi-threaded programming is inherently
more difficult than single-threaded programming. The
main source of complexity is that, the threads of an
application can interleave in so many different ways.
To ensure correctness, a programmer has to test all
possible thread interleavings, which, however, is
impractical.\par
Many rare thread interleavings remain untested in
production systems, and they are the root cause for a
majority of concurrency bugs. We propose a
shared-memory multi-processor design that avoids
untested interleavings to improve the correctness of a
multi-threaded program. Since untested interleavings
tend to occur infrequently at runtime, the performance
cost of avoiding them is not high.\par
We propose to encode the set of tested correct
interleavings in a program's binary executable using
{\em Predecessor Set (PSet)\/} constraints. These
constraints are efficiently enforced at runtime using
processor support, which ensures that the runtime
follows a tested interleaving. We analyze several bugs
in open source applications such as MySQL, Apache,
Mozilla, etc., and show that, by enforcing PSet
constraints, we can avoid not only data races and
atomicity violations, but also other forms of
concurrency bugs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "concurrency bugs; multiprocessors; parallel
programming; software reliability",
author = "Lukasz Ziarek and Suresh Jagannathan and Matthew Fluet
and Umut A. Acar",
title = "Speculative {$N$}-way barriers (abstract only)",
journal = j-SIGPLAN,
volume = "44",
number = "5",
pages = "8--8",
month = may,
year = "2009",
DOI = "https://doi.org/10.1145/1629635.1629637",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Mon Jun 21 18:01:41 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Speculative execution is an important technique that
has historically been used to extract concurrency from
sequential programs. While techniques to support
speculation work well when computations perform
relatively simple actions (e.g., reads and writes to
known locations), understanding speculation for
multi-threaded programs in which threads may
communicate and synchronize through multiple shared
references is significantly more challenging, and is
the focus of this paper.\par
We use as our reference point a simple higher-order
concurrent language extended with an n-way barrier and
a fork/join execution model. Our technique permits the
expression guarded by the barrier to speculatively
proceed before the barrier has been satisfied (i.e.,
before all threads that synchronize on that barrier
have done so) and to have participating threads that
would normally block on the barrier to speculatively
proceed as well. Our solution formulates safety
properties under which speculation is correct in a
fork/join model, and per-synchronization basis.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "R. Agarwal and S. Bensalem and E. Farchi and K.
Havelund and Y. Nir-Buchbinder and S. Stoller and S. Ur
and L. Wang",
title = "Detection of deadlock potentials in multithreaded
journal = j-IBM-JRD,
volume = "54",
number = "5",
pages = "3:1--3:15",
month = "????",
year = "2010",
DOI = "https://doi.org/10.1147/JRD.2010.2060276",
ISSN = "0018-8646 (print), 2151-8556 (electronic)",
ISSN-L = "0018-8646",
bibdate = "Sun Feb 20 14:29:19 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "IBM Journal of Research and Development",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
author = "Kunal Agrawal and Charles E. Leiserson and Jim Sukha",
title = "Helper locks for fork-join parallel programming",
journal = j-SIGPLAN,
volume = "45",
number = "5",
pages = "245--256",
month = may,
year = "2010",
DOI = "https://doi.org/10.1145/1693453.1693487",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Aug 31 22:39:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Helper locks allow programs with large parallel
critical sections, called parallel regions, to execute
more efficiently by enlisting processors that might
otherwise be waiting on the helper lock to aid in the
execution of the parallel region. Suppose that a
processor {\em p\/} is executing a parallel region {\em
A\/} after having acquired the lock {\em L\/}
protecting {\em A}. If another processor $ p' $
tries to acquire {\em L}, then instead of
blocking and waiting for {\em p\/} to complete {\em A},
processor $ p' $ joins {\em p\/} to help
it complete {\em A}. Additional processors not blocked
on {\em L\/} may also help to execute {\em A}.\par The
HELPER runtime system can execute fork-join
computations augmented with helper locks and parallel
regions. HELPER supports the unbounded nesting of
parallel regions. We provide theoretical
completion-time and space-usage bounds for a design of
HELPER based on work stealing. Specifically, let {\em
V\/} be the number of parallel regions in a
computation, let $ T_1 $ be its work, and let
$ T_\infty $ be its ``aggregate span'' --- the
sum of the spans (critical-path lengths) of all its
parallel regions. We prove that HELPER completes the
computation in expected time $ O(T_1 / P + T_\infty + P V) $ on
{\em P\/} processors. This bound indicates that
programs with a small number of highly parallel
critical sections can attain linear speedup. For the
space bound, we prove that HELPER completes a program
using only $O(P S_1)$ stack space, where $S_1$ is the
sum, over all regions, of the stack space used by each
region in a serial execution. Finally, we describe a
prototype of HELPER implemented by modifying the Cilk
multithreaded runtime system. We used this prototype to
implement a concurrent hash table with a resize
operation protected by a helper lock.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "Cilk; dynamic multithreading; helper lock; nested
parallelism; parallel region; scheduling; work
stealing",
author = "Pavan Balaji and Darius Buntinas and David Goodell and
William Gropp and Rajeev Thakur",
title = "Fine-Grained Multithreading Support for Hybrid
Threaded {MPI} Programming",
journal = j-IJHPCA,
volume = "24",
number = "1",
pages = "49--57",
month = feb,
year = "2010",
DOI = "https://doi.org/10.1177/1094342009360206",
ISSN = "1094-3420 (print), 1741-2846 (electronic)",
ISSN-L = "1094-3420",
bibdate = "Tue Aug 31 09:59:45 MDT 2010",
bibsource = "http://hpc.sagepub.com/content/24/1.toc;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://hpc.sagepub.com/content/24/1/49.full.pdf+html",
acknowledgement = ack-nhfb,
fjournal = "International Journal of High Performance Computing
Applications",
journal-URL = "http://hpc.sagepub.com/content/by/year",
author = "Gilles Barthe and Tamara Rezk and Alejandro Russo and
Andrei Sabelfeld",
title = "Security of multithreaded programs by compilation",
journal = j-TISSEC,
volume = "13",
number = "3",
pages = "21:1--21:??",
month = jul,
year = "2010",
DOI = "https://doi.org/10.1145/1805974.1895977",
ISSN = "1094-9224 (print), 1557-7406 (electronic)",
ISSN-L = "1094-9224",
bibdate = "Wed Jul 28 14:57:15 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "End-to-End security of mobile code requires that the
code neither intentionally nor accidentally propagates
sensitive information to an adversary. Although mobile
code is commonly multithreaded low-level code, there
is a lack of enforcement mechanisms that ensure information
security for such programs. The modularity is
three-fold: we give modular extensions of sequential
semantics, sequential security typing, and sequential
security-type preserving compilation that allow us to
enforce security for multithreaded programs. Thanks
to the modularity, there are no more restrictions on
multithreaded source programs than on sequential ones,
and yet we guarantee that their compilations are
provably secure for a wide class of schedulers.",
acknowledgement = ack-nhfb,
articleno = "21",
fjournal = "ACM Transactions on Information and System Security",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J789",
keywords = "compilers; Noninterference; schedulers; type systems",
author = "Tom Bergan and Owen Anderson and Joseph Devietti and
Luis Ceze and Dan Grossman",
title = "{CoreDet}: a compiler and runtime system for
deterministic multithreaded execution",
journal = j-COMP-ARCH-NEWS,
volume = "38",
number = "1",
pages = "53--64",
month = mar,
year = "2010",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Wed Mar 17 14:42:04 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Tom Bergan and Owen Anderson and Joseph Devietti and
Luis Ceze and Dan Grossman",
title = "{CoreDet}: a compiler and runtime system for
deterministic multithreaded execution",
journal = j-SIGPLAN,
volume = "45",
number = "3",
pages = "53--64",
month = mar,
year = "2010",
DOI = "https://doi.org/10.1145/1736020.1736029",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Mar 17 13:46:56 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The behavior of a multithreaded program does not
depend only on its inputs. Scheduling, memory
reordering, timing, and low-level hardware effects all
introduce nondeterminism in the execution of
multithreaded programs. This severely complicates many
tasks, including debugging, testing, and automatic
replication. In this work, we avoid these complications
by eliminating their root cause: we develop a compiler
and runtime system that runs arbitrary multithreaded
C/C++ POSIX Threads programs deterministically.\par
A trivial nonperformant approach to providing
determinism is simply deterministically serializing
execution. Instead, we present a compiler and runtime
infrastructure that ensures determinism but resorts to
serialization rarely, for handling interthread
communication and synchronization. We develop two basic
approaches, both of which are largely dynamic with
performance improved by some static compiler
optimizations. First, an ownership-based approach
detects interthread communication via an evolving table
that tracks ownership of memory regions by threads.
Second, a buffering approach uses versioned memory and
employs a deterministic commit protocol to make changes
visible to other threads. While buffering has larger
single-threaded overhead than ownership, it tends to
scale better (serializing less often). A hybrid system
sometimes performs and scales better than either
approach individually.\par
Our implementation is based on the LLVM compiler
infrastructure. It needs neither programmer annotations
nor special hardware. Our empirical evaluation uses the
PARSEC and SPLASH2 benchmarks and shows that our
approach scales comparably to nondeterministic
execution.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "compilers; determinism; multicore; multithreading",
author = "Shahid Bokhari and Joel Saltz",
title = "Exploring the performance of massively multithreaded
architectures",
journal = j-CCPE,
volume = "22",
number = "5",
pages = "588--616",
day = "10",
month = apr,
year = "2010",
DOI = "https://doi.org/10.1002/cpe.1484",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Mon Dec 5 10:08:42 MST 2011",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "1 Sep 2009",
author = "Nathan G. Bronson and Jared Casper and Hassan Chafi
and Kunle Olukotun",
title = "A practical concurrent binary search tree",
journal = j-SIGPLAN,
volume = "45",
number = "5",
pages = "257--268",
month = may,
year = "2010",
DOI = "https://doi.org/10.1145/1693453.1693488",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Aug 31 22:39:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We propose a concurrent relaxed balance AVL tree
algorithm that is fast, scales well, and tolerates
contention. It is based on optimistic techniques
adapted from software transactional memory, but takes
advantage of specific knowledge of the algorithm to
reduce overheads and avoid unnecessary retries. We
extend our algorithm with a fast linearizable clone
operation, which can be used for consistent iteration
of the tree. Experimental evidence shows that our
algorithm outperforms a highly tuned concurrent skip
list for many access patterns, with an average of 39\%
higher single-threaded throughput and 32\% higher
multi-threaded throughput over a range of contention
levels and operation mixes.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "optimistic concurrency; snapshot isolation",
author = "Jacob Burnim and Koushik Sen",
title = "Asserting and checking determinism for multithreaded
programs",
journal = j-CACM,
volume = "53",
number = "6",
pages = "97--105",
month = jun,
year = "2010",
DOI = "https://doi.org/10.1145/1743546.1743572",
ISSN = "0001-0782 (print), 1557-7317 (electronic)",
ISSN-L = "0001-0782",
bibdate = "Mon Jun 21 12:34:55 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/cacm/;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Communications of the ACM",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J79",
author = "Changbo Chen and Marc Moreno Maza and Yuzhen Xie",
title = "Cache complexity and multicore implementation for
univariate real root isolation",
journal = j-ACM-COMM-COMP-ALGEBRA,
volume = "44",
number = "3",
pages = "97--98",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1940475.1940483",
ISSN = "1932-2232 (print), 1932-2240 (electronic)",
ISSN-L = "1932-2232",
bibdate = "Thu Mar 31 10:24:16 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Isolating the real roots of a univariate polynomial is
a driving subject in computer algebra. This problem has
been studied under various angles from algebraic
algorithms [1, 2, 7] to implementation techniques [3,
5]. Today, multicores are the most popular parallel
hardware architectures. Besides, understanding the
implications of hierarchical memory on performance
software engineering has become essential. These
observations motivate our study. We analyze the cache
complexity of the core routine of many real root
isolation algorithms, namely the Taylor shift. Then, we
present an efficient multithreaded implementation on
multicores.",
acknowledgement = ack-nhfb,
fjournal = "ACM Communications in Computer Algebra",
issue = "173",
author = "M. Chetlur and U. Devi and P. Dutta and P. Gupta and
L. Chen and Z. Zhu and S. Kalyanaraman and Y. Lin",
title = "A software {WiMAX} medium access control layer using
massively multithreaded processors",
journal = j-IBM-JRD,
volume = "54",
number = "1",
pages = "??--??",
month = "????",
year = "2010",
ISSN = "0018-8646 (print), 2151-8556 (electronic)",
ISSN-L = "0018-8646",
bibdate = "Sat May 1 17:44:14 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.research.ibm.com/journal/abstracts/rd/541/chetlur-dutta.html",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "IBM Journal of Research and Development",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
author = "Jee W. Choi and Amik Singh and Richard W. Vuduc",
title = "Model-driven autotuning of sparse matrix-vector
multiply on {GPUs}",
journal = j-SIGPLAN,
volume = "45",
number = "5",
pages = "115--126",
month = may,
year = "2010",
DOI = "https://doi.org/10.1145/1693453.1693471",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Aug 31 22:39:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We present a performance model-driven framework for
automated performance tuning (autotuning) of sparse
matrix-vector multiply (SpMV) on systems accelerated by
graphics processing units (GPU). Our study consists of
two parts.\par
First, we describe several carefully hand-tuned SpMV
implementations for GPUs, identifying key GPU-specific
performance limitations, enhancements, and tuning
opportunities. These implementations, which include
variants on classical blocked compressed sparse row
(BCSR) and blocked ELLPACK (BELLPACK) storage formats,
match or exceed state-of-the-art implementations. For
instance, our best BELLPACK implementation achieves up
to 29.0 Gflop/s in single-precision and 15.7 Gflop/s in
double-precision on the NVIDIA T10P multiprocessor
(C1060), enhancing prior state-of-the-art unblocked
implementations (Bell and Garland, 2009) by up to
$ 1.8 \times $ and $ 1.5 \times $ for single- and
double-precision cases, respectively.\par
However, achieving this level of performance requires
input matrix-dependent parameter tuning. Thus, in the
second part of this study, we develop a performance
model that can guide tuning. Like prior autotuning
models for CPUs (e.g., Im, Yelick, and Vuduc, 2004),
this model requires offline measurements and run-time
estimation, but more directly models the structure of
multithreaded vector processors like GPUs. We show that
our model can identify the implementations that achieve
within 15\% of those found through exhaustive search.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "GPU; performance modeling; sparse matrix-vector
multiplication",
author = "Katherine E. Coons and Sebastian Burckhardt and
Madanlal Musuvathi",
title = "{GAMBIT}: effective unit testing for concurrency
libraries",
journal = j-SIGPLAN,
volume = "45",
number = "5",
pages = "15--24",
month = may,
year = "2010",
DOI = "https://doi.org/10.1145/1837853.1693458",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Aug 31 22:39:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "As concurrent programming becomes prevalent, software
providers are investing in concurrency libraries to
improve programmer productivity. Concurrency libraries
improve productivity by hiding error-prone, low-level
synchronization from programmers and providing
higher-level concurrent abstractions. Testing such
libraries is difficult, however, because concurrency
failures often manifest only under particular
scheduling circumstances. Current best testing
practices are often inadequate: heuristic-guided
fuzzing is not systematic, systematic schedule
enumeration does not find bugs quickly, and stress
testing is neither systematic nor fast.\par
To address these shortcomings, we propose a prioritized
search technique called GAMBIT that combines the speed
benefits of heuristic-guided fuzzing with the
soundness, progress, and reproducibility guarantees of
stateless model checking. GAMBIT combines known
techniques such as partial-order reduction and
preemption-bounding with a generalized best-first
search framework that prioritizes schedules likely to
expose bugs. We evaluate GAMBIT's effectiveness on
newly released concurrency libraries for Microsoft's
.NET framework. Our experiments show that GAMBIT finds
bugs more quickly than prior stateless model checking
techniques without compromising coverage guarantees or
reproducibility.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrency; model checking; multithreading;
partial-order reduction; preemption bound; software
testing",
author = "Mads Dam and Bart Jacobs and Andreas Lundblad and
Frank Piessens",
title = "Provably correct inline monitoring for multithreaded
{Java}-like programs",
journal = j-J-COMP-SECUR,
volume = "18",
number = "1",
pages = "37--59",
month = "????",
year = "2010",
DOI = "https://doi.org/10.3233/JCS-2010-0365",
ISSN = "0926-227X (print), 1875-8924 (electronic)",
ISSN-L = "0926-227X",
bibdate = "Tue May 24 06:24:34 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Journal of Computer Security",
journal-URL = "http://content.iospress.com/journals/journal-of-computer-security",
author = "Jason Jianxun Ding and Abdul Waheed and Jingnan Yao
and Laxmi N. Bhuyan",
title = "Performance characterization of multi-thread and
multi-core processors based {XML} application oriented
networking systems",
journal = j-J-PAR-DIST-COMP,
volume = "70",
number = "5",
pages = "584--597",
month = may,
year = "2010",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Wed Sep 1 16:27:28 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Keisuke Dohi and Yuichiro Shibata and Tsuyoshi Hamada
and Tomonari Masada and Kiyoshi Oguri and Duncan A.
Buell",
title = "Implementation of a programming environment with a
multithread model for reconfigurable systems",
journal = j-COMP-ARCH-NEWS,
volume = "38",
number = "4",
pages = "40--45",
month = sep,
year = "2010",
DOI = "https://doi.org/10.1145/1926367.1926375",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Thu Jan 20 14:27:03 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Reconfigurable systems are known to be able to achieve
higher performance than traditional microprocessor
architecture for many application fields. However, in
order to extract a full potential of the reconfigurable
systems, programmers often have to design and describe
the best suited code for their target architecture with
specialized knowledge. The aim of this paper is to
assist the users of reconfigurable systems by
implementing a translator with a multithread model. The
experimental results show our translator automatically
generates efficient performance-aware code segments
including DMA transfer and shift registers for memory
access optimization.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Susan Eggers",
title = "{2010 Athena} lecture",
journal = j-SIGPLAN,
volume = "45",
number = "6",
pages = "98--98",
month = jun,
year = "2010",
DOI = "https://doi.org/10.1145/1809028.1806608",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Fri Oct 8 17:53:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Susan Eggers, a Professor of Computer Science and
Engineering at the University of Washington, joined her
department in 1989. She received a B.A. in 1965 from
Connecticut College and a Ph.D. in 1989 from the
University of California, Berkeley. Her research
interests are in computer architecture and back-end
compiler optimization, with an emphasis on experimental
performance analysis. With her colleague Hank Levy and
their students, she developed the first commercially
viable multithreaded architecture, Simultaneous
Multithreading, adopted by Intel (as Hyperthreading),
IBM, Sun and others. Her current research is in the
areas of distributed dataflow machines, FPGAs and chip
multiprocessors. In 1989 Professor Eggers was awarded
an IBM Faculty Development Award, in 1990 an NSF
Presidential Young Investigator Award, in 1994 the
Microsoft Professorship in Computer Science and
Engineering, and in 2009 the ACM-W Athena Lecturer. She
is a Fellow of the ACM and IEEE, a Fellow of the AAAS,
and a member of the National Academy of Engineering.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "invited talk",
author = "Stijn Eyerman and Lieven Eeckhout",
title = "Probabilistic job symbiosis modeling for {SMT}
processor scheduling",
journal = j-SIGPLAN,
volume = "45",
number = "3",
pages = "91--102",
month = mar,
year = "2010",
DOI = "https://doi.org/10.1145/1736020.1736033",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Mar 17 13:46:56 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Symbiotic job scheduling boosts simultaneous
multithreading (SMT) processor performance by
co-scheduling jobs that have `compatible' demands on
the processor's shared resources. Existing approaches
however require a sampling phase, evaluate a limited
number of possible co-schedules, use heuristics to
gauge symbiosis, are rigid in their optimization
target, and do not preserve system-level
priorities/shares.\par
This paper proposes probabilistic job symbiosis
modeling, which predicts whether jobs will create
positive or negative symbiosis when co-scheduled
without requiring the co-schedule to be evaluated. The
model, which uses per-thread cycle stacks computed
through a previously proposed cycle accounting
architecture, is simple enough to be used in system
software. Probabilistic job symbiosis modeling provides
six key innovations over prior work in symbiotic job
scheduling: (i) it does not require a sampling phase,
(ii) it readjusts the job co-schedule continuously,
(iii) it evaluates a large number of possible
co-schedules at very low overhead, (iv) it is not
driven by heuristics, (v) it can optimize a performance
target of interest (e.g., system throughput or job
turnaround time), and (vi) it preserves system-level
priorities/shares. These innovations make symbiotic job
scheduling both practical and effective.\par
Our experimental evaluation, which assumes a realistic
scenario in which jobs come and go, reports an average
16\% (and up to 35\%) reduction in job turnaround time
compared to the previously proposed SOS (sample,
optimize, symbios) approach for a two-thread SMT
processor, and an average 19\% (and up to 45\%)
reduction in job turnaround time for a four-thread SMT
processor.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "performance modeling; simultaneous multi-threading
(SMT); symbiotic job scheduling",
author = "Cormac Flanagan and Stephen N. Freund",
title = "Adversarial memory for detecting destructive races",
journal = j-SIGPLAN,
volume = "45",
number = "6",
pages = "244--254",
month = jun,
year = "2010",
DOI = "https://doi.org/10.1145/1806596.1806625",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Fri Oct 8 17:53:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multithreaded programs are notoriously prone to race
conditions, a problem exacerbated by the widespread
adoption of multi-core processors with complex memory
models and cache coherence protocols. Much prior work
has focused on static and dynamic analyses for race
detection, but these algorithms typically are unable to
distinguish destructive races that cause erroneous
behavior from benign races that do not. Performing this
classification manually is difficult, time consuming,
and error prone.\par
This paper presents a new dynamic analysis technique
that uses {\em adversarial memory\/} to classify race
conditions as destructive or benign on systems with
relaxed memory models. Unlike a typical language
implementation, which may only infrequently exhibit
non-sequentially consistent behavior, our adversarial
memory implementation exploits the full freedom of the
memory model to return older, unexpected, or stale
values for memory reads whenever possible, in an
attempt to crash the target program (that is, to force
the program to behave erroneously). A crashing
execution provides concrete evidence of a destructive
bug, and this bug can be strongly correlated with a
specific race condition in the target
program.\par
Experimental results with our Jumble prototype for Java
demonstrate that adversarial memory is highly effective
at identifying destructive race conditions, and in
distinguishing them from race conditions that are real
but benign. Adversarial memory can also reveal
destructive races that would not be detected by
traditional testing (even after thousands of runs) or
by model checkers that assume sequential consistency.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "concurrency; dynamic analysis; race conditions;
relaxed memory models",
author = "Dan Gibson and David A. Wood",
title = "{Forwardflow}: a scalable core for power-constrained
{CMPs}",
journal = j-COMP-ARCH-NEWS,
volume = "38",
number = "3",
pages = "14--25",
month = jun,
year = "2010",
DOI = "https://doi.org/10.1145/1816038.1815966",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Jul 6 14:11:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Chip Multiprocessors (CMPs) are now commodity
hardware, but commoditization of parallel software
remains elusive. In the near term, the current trend of
increased core-per-socket count will continue, despite
a lack of parallel software to exercise the hardware.
Future CMPs must deliver thread-level parallelism when
software provides threads to run, but must also
continue to deliver performance gains for single
threads by exploiting instruction-level parallelism and
memory-level parallelism. However, power limitations
will prevent conventional cores from exploiting both
simultaneously.\par
This work presents the Forwardflow Architecture, which
can scale its execution logic up to run single threads,
or down to run multiple threads in a CMP. Forwardflow
dynamically builds an explicit internal dataflow
representation from a conventional instruction set
architecture, using forward dependence pointers to
guide instruction wakeup, selection, and issue.
Forwardflow's backend is organized into discrete units
that can be individually (de-)activated, allowing each
core's performance to be scaled by system software at
the architectural level.\par
On single threads, Forwardflow core scaling yields a
mean runtime reduction of 21\% for a 37\% increase in
power consumption. For multithreaded workloads, a
Forwardflow-based CMP allows system software to select
the performance point that best matches available
power.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "chip multiprocessor (cmp); power; scalable core",
author = "M. Gupta and F. Sanchez and J. Llosa",
title = "{CSMT}: Simultaneous Multithreading for Clustered
{VLIW} Processors",
journal = j-IEEE-TRANS-COMPUT,
volume = "59",
number = "3",
pages = "385--399",
month = mar,
year = "2010",
DOI = "https://doi.org/10.1109/TC.2009.96",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Sun Jul 3 11:52:26 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5161255",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Andrew Hilton and Amir Roth",
title = "{SMT-Directory}: Efficient Load-Load Ordering for
{SMT}",
volume = "9",
number = "1",
pages = "25--28",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Memory models like SC, TSO, and PC enforce load-load
ordering, requiring that loads from any single thread
appear to occur in program order to all other threads.
Out-of-order execution can violate load-load ordering.
Conventional multi-processors with out-of-order cores
detect load-load ordering violations by snooping an
age-ordered load queue on cache invalidations or
evictions --- events that act as proxies for the completion
of remote stores. This mechanism becomes less efficient
in an SMT processor, as every completing store must
search the load queue segments of all other threads.
This inefficiency exists because store completions from
other threads in the same core are not filtered by the
cache and coherence protocol: thread 0 observes all of
thread 1's stores, not only the first store to every
cache line. SMT-Directory eliminates this overhead by
implementing the filtering traditionally provided by
the cache in the cache itself. SMT-Directory adds a
per-thread ``{read''} bit to every data cache line.
When a load executes, it sets the bit corresponding to
its thread. When a store completes and writes to the
cache, it checks the SMT-Directory bits of its cache
line and searches the load queue segments only of those
threads whose bits are set. As a result, local store
completions trigger searches only for data that is
actually shared.",
acknowledgement = ack-nhfb,
affiliation = "Hilton, A (Reprint Author), Univ Penn, Philadelphia,
PA 19104 USA. Hilton, Andrew; Roth, Amir, Univ Penn,
Philadelphia, PA 19104 USA.",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-0541292]",
funding-text = "We thank Arun Raghavan for the address traces and Milo
Martin for comments on early versions of this work. The
anonymous reviewers provided valuable feedback. This
work was supported by NSF award CCF-0541292.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "consistency models; directory; load queue search;
load-load ordering; Simultaneous multithreading",
keywords-plus = "CONSISTENCY",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Hilton:2010:SDE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
author = "Ramesh Illikkal and Vineet Chadha and Andrew Herdrich
and Ravi Iyer and Donald Newell",
title = "{PIRATE}: {QoS} and performance management in {CMP}
architectures",
journal = j-SIGMETRICS,
volume = "37",
number = "4",
pages = "3--10",
month = mar,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1773394.1773396",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Wed Aug 25 07:35:13 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "As new multi-threaded usage models such as
virtualization and consolidation take advantage of
multiple cores in CMP architectures, the impact of
shared resource contention between VMs and user-level
applications introduces Quality of Service (QoS)
concerns and challenges. QoS-aware management of these
shared platform resources is therefore becoming
increasingly important. Various QoS schemes for
resource management have been recently proposed, but
most of these prior efforts have been focused on
controlling individual resource allocation based on
priority information passed down from the OS or
Hypervisor to system resources. The complexity of this
approach increases when multiple levels of resources
are associated with an application's performance and
power consumption. In this paper we employ simpler
rate-based QoS mechanisms which control the execution
rate of competing applications. To enable
differentiation between simultaneously running
applications' performance and power consumption, these
rate mechanisms need to dynamically adjust the
execution of application. Our proposed PI-RATE
architecture introduces a control-theoretic approach to
dynamically adjust the execution rate of each
application based on the QoS target and monitored
resource utilization. We evaluate three modes of
PI-RATE architecture --- cache QoS targets, performance
QoS targets and power QoS targets --- to show that the
PI-RATE architecture is flexible and effective at
enabling QoS in a CMP platform.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
keywords = "clock modulation; frequency scaling; integral
controller; proportional",
author = "Byunghyun Jang and Perhaad Mistry and Dana Schaa and
Rodrigo Dominguez and David Kaeli",
title = "Data transformations enabling loop vectorization on
multithreaded data parallel architectures",
journal = j-SIGPLAN,
volume = "45",
number = "5",
pages = "353--354",
month = may,
year = "2010",
DOI = "https://doi.org/10.1145/1837853.1693510",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Aug 31 22:39:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Loop vectorization, a key feature exploited to obtain
high performance on Single Instruction Multiple Data
(SIMD) vector architectures, is significantly hindered
by irregular memory access patterns in the data stream.
This paper describes data transformations that allow us
to vectorize loops targeting massively multithreaded
data parallel architectures. We present a mathematical
model that captures loop-based memory access patterns
and computes the most appropriate data transformations
in order to enable vectorization. Our experimental
results show that the proposed data transformations can
significantly increase the number of loops that can be
vectorized and enhance the data-level parallelism of
applications. Our results also show that the overhead
associated with our data transformations can be easily
amortized as the size of the input data set increases.
For the set of high performance benchmark kernels
studied, we achieve consistent and significant
performance improvements (up to 11.4X) by applying
vectorization using our data transformation approach.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "data transformation; GPGPU; loop vectorization",
author = "Oren Laadan and Nicolas Viennot and Jason Nieh",
title = "Transparent, lightweight application execution replay
on commodity multiprocessor operating systems",
journal = j-SIGMETRICS,
volume = "38",
number = "1",
pages = "155--166",
month = jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1811039.1811057",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Wed Aug 25 07:35:52 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We present Scribe, the first system to provide
transparent, low-overhead application record-replay and
the ability to go live from replayed execution. Scribe
introduces new lightweight operating system mechanisms,
rendezvous and sync points, to efficiently record
nondeterministic interactions such as related system
calls, signals, and shared memory accesses. Rendezvous
points make a partial ordering of execution based on
system call dependencies sufficient for replay,
avoiding the recording overhead of maintaining an exact
execution ordering. Sync points convert asynchronous
interactions that can occur at arbitrary times into
synchronous events that are much easier to record and
replay.\par
We have implemented Scribe without changing, relinking,
or recompiling applications, libraries, or operating
system kernels, and without any specialized hardware
support such as hardware performance counters. It works
on commodity Linux operating systems, and commodity
multi-core and multiprocessor hardware. Our results
show for the first time that an operating system
mechanism can correctly and transparently record and
replay multi-process and multi-threaded applications on
commodity multiprocessors. Scribe recording overhead is
less than 2.5\% for server applications including
Apache and MySQL, and less than 15\% for desktop
applications including Firefox, Acrobat, OpenOffice,
parallel kernel compilation, and movie playback.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
keywords = "debugging; fault-tolerance; record-replay;
author = "Dongyoon Lee and Benjamin Wester and Kaushik
Veeraraghavan and Satish Narayanasamy and Peter M. Chen
and Jason Flinn",
title = "{Respec}: efficient online multiprocessor replay via
speculation and external determinism",
journal = j-SIGPLAN,
volume = "45",
number = "3",
pages = "77--90",
month = mar,
year = "2010",
DOI = "https://doi.org/10.1145/1736020.1736031",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Mar 17 13:46:56 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Deterministic replay systems record and reproduce the
execution of a hardware or software system. While it is
well known how to replay uniprocessor systems,
replaying shared memory multiprocessor systems at low
overhead on commodity hardware is still an open
problem. This paper presents Respec, a new way to
support deterministic replay of shared memory
multithreaded programs on commodity multiprocessor
hardware. Respec targets online replay in which the
recorded and replayed processes execute
concurrently.\par
Respec uses two strategies to reduce overhead while
still ensuring correctness: speculative logging and
externally deterministic replay. Speculative logging
optimistically logs less information about shared
memory dependencies than is needed to guarantee
deterministic replay, then recovers and retries if the
replayed process diverges from the recorded process.
Externally deterministic replay relaxes the degree to
which the two executions must match by requiring only
their system output and final program states match. We
show that the combination of these two techniques
results in low recording and replay overhead for the
common case of data-race-free execution intervals and
still ensures correct replay for execution intervals
that have data races.\par
We modified the Linux kernel to implement our
techniques. Our software system adds on average about
18\% overhead to the execution time for recording and
replaying programs with two threads and 55\% overhead
for programs with four threads.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "external determinism; replay; speculative execution",
author = "Yi-Neng Lin and Ying-Dar Lin and Yuan-Cheng Lai",
title = "Thread allocation in {CMP}-based multithreaded network
processors",
volume = "36",
number = "2--3",
pages = "104--116",
month = feb # "\slash " # mar,
year = "2010",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Thu Sep 2 17:51:12 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
author = "Sandya Mannarswamy and Dhruva R. Chakrabarti and
Kaushik Rajan and Sujoy Saraswati",
title = "Compiler aided selective lock assignment for improving
the performance of software transactional memory",
journal = j-SIGPLAN,
volume = "45",
number = "5",
pages = "37--46",
month = may,
year = "2010",
DOI = "https://doi.org/10.1145/1693453.1693460",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Aug 31 22:39:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Atomic sections have been recently introduced as a
language construct to improve the programmability of
concurrent software. They simplify programming by not
requiring the explicit specification of locks for
shared data. Typically atomic sections are supported in
software either through the use of optimistic
concurrency by using transactional memory or through
the use of pessimistic concurrency using
compiler-assigned locks. As a software transactional
memory (STM) system does not take advantage of the
specific memory access patterns of an application it
often suffers from false conflicts and high validation
overheads. On the other hand, the compiler usually ends
up assigning coarse grain locks as it relies on whole
program points-to analysis which is conservative by
nature. This adversely affects performance by limiting
concurrency. In order to mitigate the disadvantages
associated with STM's lock assignment scheme, we
propose a hybrid approach which combines STM's lock
assignment with a compiler aided selective lock
assignment scheme (referred to as SCLA-STM). SCLA-STM
overcomes the inefficiencies associated with a purely
compile-time lock assignment approach by (i) using the
underlying STM for shared variables where only a
conservative analysis is possible by the compiler
(e.g., in the presence of may-alias points to
information) and (ii) being selective about the shared
data chosen for the compiler-aided lock assignment. We
describe our prototype SCLA-STM scheme implemented in
the HP-UX IA-64 C/C++ compiler, using TL2 as our STM
implementation. We show that SCLA-STM improves
application performance for certain STAMP benchmarks
from 1.68\% to 37.13\%.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "compilers; multithreading; parallelization;
author = "Daniel Marino and Abhayendra Singh and Todd Millstein
and Madanlal Musuvathi and Satish Narayanasamy",
title = "{DRFX}: a simple and efficient memory model for
concurrent programming languages",
journal = j-SIGPLAN,
volume = "45",
number = "6",
pages = "351--362",
month = jun,
year = "2010",
DOI = "https://doi.org/10.1145/1806596.1806636",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Fri Oct 8 17:53:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The most intuitive memory model for shared-memory
multithreaded programming is {\em sequential
consistency\/} (SC), but it disallows the use of many
compiler and hardware optimizations thereby impacting
performance. Data-race-free (DRF) models, such as the
proposed C++0x memory model, guarantee SC execution for
data-race-free programs. But these models provide no
guarantee at all for racy programs, compromising the
safety and debuggability of such programs. To address
the safety issue, the Java memory model, which is also
based on the DRF model, provides a weak semantics for
racy executions. However, this semantics is subtle and
complex, making it difficult for programmers to reason
about their programs and for compiler writers to ensure
the correctness of compiler optimizations.\par
We present the DRFx memory model, which is simple for
programmers to understand and use while still
supporting many common optimizations. We introduce a
{\em memory model (MM) exception\/} which can be
signaled to halt execution. If a program executes
without throwing this exception, then DRFx guarantees
that the execution is SC. If a program throws an MM
exception during an execution, then DRFx guarantees
that the program has a data race. We observe that SC
violations can be detected in hardware through a
lightweight form of conflict detection. Furthermore,
our model safely allows aggressive compiler and
hardware optimizations within compiler-designated
program regions. We formalize our memory model, prove
several properties about this model, describe a
compiler and hardware design suitable for DRFx, and
evaluate the performance overhead due to our compiler
and hardware requirements.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "data races; memory model exception; memory models;
sequential consistency; soft fences",
author = "Paul E. McKenney and Maged M. Michael and Josh
Triplett and Jonathan Walpole",
title = "Why the grass may not be greener on the other side: a
comparison of locking vs. transactional memory",
journal = j-OPER-SYS-REV,
volume = "44",
number = "3",
pages = "93--101",
month = jul,
year = "2010",
DOI = "https://doi.org/10.1145/1842733.1842749",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Thu Aug 19 14:21:54 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The advent of multi-core and multi-threaded processor
architectures highlights the need to address the
well-known shortcomings of the ubiquitous lock-based
synchronization mechanisms. To this end, transactional
memory has been viewed by many as a promising
alternative to locking. This paper therefore presents a
constructive critique of locking and transactional
memory: their strengths, weaknesses, and opportunities
for improvement.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
author = "Lingchuan Meng and Jeremy Johnson and Franz Franchetti
and Yevgen Voronenko and Marc Moreno Maza and Yuzhen
Xie",
title = "Abstract only: {SPIRAL}-generated modular {FFTs}",
volume = "44",
number = "2",
pages = "25--26",
month = jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1838599.1838616",
ISSN = "1932-2232 (print), 1932-2240 (electronic)",
ISSN-L = "1932-2232",
bibdate = "Mon Aug 2 13:47:24 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In this poster we present the use of the SPIRAL system
(www.spiral.net) to generate code for modular Fast
Fourier Transforms (FFTs). SPIRAL is a library
generation system that automatically generates
platform-tuned implementations of digital signal
processing algorithms with an emphasis on fast
transforms. Currently, SPIRAL can generate highly
optimized fixed point and floating-point FFTs for a
variety of platforms including vectorization,
multi-threaded and distributed memory parallelization.
The code produced is competitive with the best
available code for these platforms and SPIRAL is used
by Intel for its IPP (Intel Performance Primitives) and
MKL (Math Kernel Library) libraries.\par
The SPIRAL system uses a mathematical framework for
representing and deriving algorithms. Algorithms are
derived using rewrite rules and additional rules are
used to symbolically manipulate algorithms into forms
that take advantage of the underlying hardware. A
search engine with a feedback loop is used to tune
implementations to particular platforms. New transforms
are added by introducing new symbols and their
definition and new algorithms can be generated by
adding new rules.\par
We extended SPIRAL to generate algorithms for FFT
computation over finite fields. This addition required
adding a new data type, several new rules and a new
transform (ModDFT) definition. In addition, the
unparser (where code is generated) was extended so that
it can generate scalar and vectorized code for modular
arithmetic. With these enhancements, the SPIRAL
machinery can be applied to modular transforms that are
of interest to the computer algebra community. This
provides a framework for systematically optimizing
these transforms, utilizing vector and parallel
computation, and for automatically tuning them to
different platforms. In this poster we present
preliminary results from this exploration. We show that
the code generated by SPIRAL, with improved cache
locality and vectorization, is approximately ten times
faster than the modular FFT code in the modpn
library.",
acknowledgement = ack-nhfb,
fjournal = "ACM Communications in Computer Algebra",
issue = "172",
author = "Jiayuan Meng and David Tarjan and Kevin Skadron",
title = "Dynamic warp subdivision for integrated branch and
memory divergence tolerance",
journal = j-COMP-ARCH-NEWS,
volume = "38",
number = "3",
pages = "235--246",
month = jun,
year = "2010",
DOI = "https://doi.org/10.1145/1815961.1815992",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Tue Jul 6 14:11:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "SIMD organizations amortize the area and power of
fetch, decode, and issue logic across multiple
processing units in order to maximize throughput for a
given area and power budget. However, throughput is
reduced when a set of threads operating in lockstep (a
warp) are stalled due to long latency memory accesses.
The resulting idle cycles are extremely costly.
Multi-threading can hide latencies by interleaving the
execution of multiple warps, but deep multi-threading
using many warps dramatically increases the cost of the
register files (multi-threading depth $ \times $ SIMD
width), and cache contention can make performance
worse. Instead, intra-warp latency hiding should first
be exploited. This allows threads that are ready but
stalled by SIMD restrictions to use these idle cycles
and reduces the need for multi-threading among warps.
This paper introduces {\em dynamic warp subdivision\/}
(DWS), which allows a single warp to occupy more than
one slot in the scheduler without requiring extra
register file space. Independent scheduling entities
allow divergent branch paths to interleave their
execution, and allow threads that hit to run ahead. The
result is improved latency hiding and memory level
parallelism (MLP). We evaluate the technique on a
coherent cache hierarchy with private L1 caches and a
shared L2 cache. With an area overhead of less than
1\%, experiments with eight data-parallel benchmarks
show our technique improves performance on average by
1.7$ \times $.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
keywords = "branch divergence; cache; latency hiding; memory
divergence; SIMD; warp",
author = "Sai Prashanth Muralidhara and Mahmut Kandemir and
Padma Raghavan",
title = "Intra-application shared cache partitioning for
multithreaded applications",
journal = j-SIGPLAN,
volume = "45",
number = "5",
pages = "329--330",
month = may,
year = "2010",
DOI = "https://doi.org/10.1145/1837853.1693498",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Aug 31 22:39:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In this paper, we address the problem of partitioning
a shared cache when the executing threads belong to the
same application.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "cache; multicore; parallel applications",
author = "Takuya Nakaike and Maged M. Michael",
title = "Lock elision for read-only critical sections in
{Java}",
journal = j-SIGPLAN,
volume = "45",
number = "6",
pages = "269--278",
month = jun,
year = "2010",
DOI = "https://doi.org/10.1145/1806596.1806627",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Fri Oct 8 17:53:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "It is not uncommon in parallel workloads to encounter
shared data structures with read-mostly access
patterns, where operations that update data are
infrequent and most operations are read-only.
Typically, data consistency is guaranteed using mutual
exclusion or read-write locks. The cost of atomic
update of lock variables results in high overheads and
high cache coherence traffic under active sharing, thus
slowing down single thread performance and limiting
scalability.\par
In this paper, we present {\em SOLERO (Software
Optimistic Lock Elision for Read-Only critical
sections)}, a new lock implementation for
optimizing read-only critical sections in Java based on
sequential locks. SOLERO is compatible with the
conventional lock implementation of Java. However,
unlike the conventional implementation, only critical
sections that may write data or have side effects need
to update lock variables, while read-only critical
sections need only read lock variables without writing
them. Each writing critical section changes the lock
value to a new value. Hence, a read-only critical
section is guaranteed to be consistent if the lock is
free and its value does not change from the beginning
to the end of the read-only critical section.\par
Using Java workloads including SPECjbb2005 and the
HashMap and TreeMap Java classes, we evaluate the
performance impact of applying SOLERO to read-mostly
locks. Our experimental results show performance
improvements across the board, often substantial, in
both single thread speed and scalability over the
conventional lock implementation (mutual exclusion) and
read-write locks. SOLERO improves the performance of
SPECjbb2005 by 3-5\% on single and multiple threads.
The results using the HashMap and TreeMap benchmarks
show that SOLERO outperforms the conventional lock
implementation and read-write locks by substantial
multiples on multi-threads.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "java; just-in-time compiler; lock; lock elision;
monitor; optimization; synchronization",
author = "Jung-Wook Park and Hoon-Mo Yang and Gi-Ho Park and
Shin-Dug Kim and Charles C. Weems",
title = "An instruction-systolic programmable shader
architecture for multi-threaded {$3$D} graphics
processing",
journal = j-J-PAR-DIST-COMP,
volume = "70",
number = "11",
pages = "1110--1118",
month = nov,
year = "2010",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Wed Sep 1 16:27:29 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Petar Radojkovi{\'c} and Vladimir {\v{C}}akarevi{\'c}
and Javier Verd{\'u} and Alex Pajuelo and Francisco J.
Cazorla and Mario Nemirovsky and Mateo Valero",
title = "Thread to strand binding of parallel network
applications in massive multi-threaded systems",
journal = j-SIGPLAN,
volume = "45",
number = "5",
pages = "191--202",
month = may,
year = "2010",
DOI = "https://doi.org/10.1145/1837853.1693480",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Aug 31 22:39:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In processors with several levels of hardware resource
sharing, like CMPs in which each core is an SMT, the
scheduling process becomes more complex than in
processors with a single level of resource sharing,
such as pure-SMT or pure-CMP processors. Once the
operating system selects the set of applications to
simultaneously schedule on the processor (workload),
each application/thread must be assigned to one of the
hardware contexts (strands). We call this last
scheduling step the Thread to Strand Binding or TSB. In
this paper, we show that the TSB impact on the
performance of processors with several levels of shared
resources is high. We measure a variation of up to 59\%
between different TSBs of real multithreaded network
applications running on the UltraSPARC T2 processor
which has three levels of resource sharing. In our
view, this problem is going to be more acute in future
multithreaded architectures comprising more cores, more
contexts per core, and more levels of resource
sharing.\par
We propose a resource-sharing aware TSB algorithm
(TSBSched) that significantly facilitates the problem
of thread to strand binding for software-pipelined
applications, representative of multithreaded network
applications. Our systematic approach encapsulates
both, the characteristics of multithreaded processors
under the study and the structure of the software
pipelined applications. Once calibrated for a given
processor architecture, our proposal does not require
hardware knowledge on the side of the programmer, nor
extensive profiling of the application. We validate our
algorithm on the UltraSPARC T2 processor running a set
of real multithreaded network applications on which we
report improvements of up to 46\% compared to the
current state-of-the-art dynamic schedulers.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "CMT; process scheduling; simultaneous multithreading;
UltraSPARC T2",
author = "R. Rakvic and Q. Cai and J. Gonz{\'a}lez and G.
Magklis and P. Chaparro and A. Gonz{\'a}lez",
title = "Thread-management techniques to maximize efficiency in
multicore and simultaneous multithreaded
microprocessors",
journal = j-TACO,
volume = "7",
number = "2",
pages = "9:1--9:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839671",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We provide an analysis of thread-management techniques
that increase performance or reduce energy in multicore
and Simultaneous Multithreaded (SMT) cores. Thread
delaying reduces energy consumption by running the core
containing the critical thread at maximum frequency
while scaling down the frequency and voltage of the
cores containing noncritical threads. In this article,
we provide an insightful breakdown of thread delaying
on a simulated multi-core microprocessor. Thread
balancing improves overall performance by giving higher
priority to the critical thread in the issue queue of
an SMT core. We provide a detailed breakdown of
performance results for thread-balancing, identifying
performance benefits and limitations. For those
benchmarks where a performance benefit is not possible,
we introduce a novel thread-balancing mechanism on an
SMT core that can reduce energy consumption. We have
performed a detailed study on an Intel microprocessor
simulator running parallel applications. Thread
delaying can reduce energy consumption by 4\% to 44\%
with negligible performance loss. Thread balancing can
increase performance by 20\% or can reduce energy
consumption by 23\%.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
keywords = "critical threads; energy-aware; low-power; Meeting
point thread characterization; microarchitecture;
multi-threaded application; thread balancing; thread
delaying",
author = "Arun Raman and Hanjun Kim and Thomas R. Mason and
Thomas B. Jablin and David I. August",
title = "Speculative parallelization using software
multi-threaded transactions",
journal = j-COMP-ARCH-NEWS,
volume = "38",
number = "1",
pages = "65--76",
month = mar,
year = "2010",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Wed Mar 17 14:42:04 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Arun Raman and Hanjun Kim and Thomas R. Mason and
Thomas B. Jablin and David I. August",
title = "Speculative parallelization using software
multi-threaded transactions",
journal = j-SIGPLAN,
volume = "45",
number = "3",
pages = "65--76",
month = mar,
year = "2010",
DOI = "https://doi.org/10.1145/1736020.1736030",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Mar 17 13:46:56 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "With the right techniques, multicore architectures may
be able to continue the exponential performance trend
that elevated the performance of applications of all
types for decades. While many scientific programs can
be parallelized without speculative techniques,
speculative parallelism appears to be the key to
continuing this trend for general-purpose applications.
Recently-proposed code parallelization techniques, such
as those by Bridges et al. and by Thies et al.,
demonstrate scalable performance on multiple cores by
using speculation to divide code into atomic units
(transactions) that span multiple threads in order to
expose data parallelism. Unfortunately, most software
and hardware Thread-Level Speculation (TLS) memory
systems and transactional memories are not sufficient
because they only support single-threaded atomic units.
Multi-threaded Transactions (MTXs) address this
problem, but they require expensive hardware support as
currently proposed in the literature. This paper
proposes a Software MTX (SMTX) system that captures the
{\em applicability\/} and {\em performance\/} of
hardware MTX, but on {\em existing multicore machines}.
The SMTX system yields a harmonic mean speedup of
13.36x on native hardware with four 6-core processors
(24 cores in total) running speculatively parallelized
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "automatic parallelization; loop-level parallelism;
multi-threaded transactions; pipelined parallelism;
software transactional memory; thread-level
author = "Layali Rashid and Wessam M. Hassanein and Moustafa A.
title = "Analyzing and enhancing the parallel sort operation on
multithreaded architectures",
volume = "53",
number = "2",
pages = "293--312",
month = aug,
year = "2010",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Wed Aug 25 08:39:00 MDT 2010",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=53&issue=2;
URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=53&issue=2&spage=293",
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Daniel Sanchez and George Michelogiannakis and
Christos Kozyrakis",
title = "An analysis of on-chip interconnection networks for
large-scale chip multiprocessors",
journal = j-TACO,
volume = "7",
number = "1",
pages = "4:1--4:??",
month = apr,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1736065.1736069",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 5 15:38:13 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "With the number of cores of chip multiprocessors
(CMPs) rapidly growing as technology scales down,
connecting the different components of a CMP in a
scalable and efficient way becomes increasingly
challenging. In this article, we explore the
architectural-level implications of interconnection
network design for CMPs with up to 128 fine-grain
multithreaded cores. We evaluate and compare different
network topologies using accurate simulation of the
full chip, including the memory hierarchy and
interconnect, and using a diverse set of scientific and
engineering workloads.\par
We find that the interconnect has a large impact on
performance, as it is responsible for 60\% to 75\% of
the miss latency. Latency, and not bandwidth, is the
primary performance constraint, since, even with many
threads per core and workloads with high miss rates,
networks with enough bandwidth can be efficiently
implemented for the system scales we consider. From the
topologies we study, the flattened butterfly
consistently outperforms the mesh and fat tree on all
workloads, leading to performance advantages of up to
22\%. We also show that considering interconnect and
memory hierarchy together when designing large-scale
CMPs is crucial, and neglecting either of the two can
lead to incorrect conclusions. Finally, the effect of
the interconnect on overall performance becomes more
important as the number of cores increases, making
interconnection choices especially critical when
scaling up.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
keywords = "chip multiprocessors; hierarchical networks;
author = "Angela C. Sodan and Jacob Machina and Arash Deshmeh
and Kevin Macnaughton and Bryan Esbaugh",
title = "Parallelism via Multithreaded and Multicore {CPUs}",
journal = j-COMPUTER,
volume = "43",
number = "3",
pages = "24--32",
month = mar,
year = "2010",
DOI = "https://doi.org/10.1109/MC.2010.75",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Wed May 12 22:57:42 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/computer2010.bib;
acknowledgement = ack-nhfb,
fjournal = "Computer",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
author = "Niranjan Soundararajan and Anand Sivasubramaniam and
Vijay Narayanan",
title = "Characterizing the soft error vulnerability of
multicores running multithreaded applications",
journal = j-SIGMETRICS,
volume = "38",
number = "1",
pages = "379--380",
month = jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1811099.1811096",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Wed Aug 25 07:35:52 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multicores have become the platform of choice across
all market segments. Cost-effective protection against
soft errors is important in these environments, due to
the need to move to lower technology generations and
the exploding number of transistors on a chip. While
multicores offer the flexibility of varying the number
of application threads and the number of cores on which
they run, the reliability impact of choosing one
configuration over another is unclear. Our study
reveals that the reliability costs vary dramatically
between configurations and being unaware could lead to
a sub-optimal choice.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
keywords = "fit rate; multicore; soft errors",
author = "Dean F. Sutherland and William L. Scherlis",
title = "Composable thread coloring",
journal = j-SIGPLAN,
volume = "45",
number = "5",
pages = "233--244",
month = may,
year = "2010",
DOI = "https://doi.org/10.1145/1693453.1693485",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Aug 31 22:39:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper introduces the language-independent concept
of ``thread usage policy.'' Many multi-threaded
software systems contain policies that regulate
associations among threads, executable code, and
potentially shared state. A system, for example, may
constrain which threads are permitted to execute
particular code segments, usually as a means to
constrain those threads from accessing or writing
particular elements of state. These policies ensure
properties such as state confinement or reader/writer
constraints, often without recourse to locking or
transaction discipline.\par
Our approach allows developers to concisely document
their thread usage policies in a manner that enables
the use of sound scalable analysis to assess
consistency of policy and as-written code. This paper
identifies the key semantic concepts of our thread
coloring language and illustrates how to use its
succinct source-level annotations to express models of
thread usage policies, following established annotation
conventions for Java.\par
We have built a prototype static analysis tool,
implemented as an integrated development environment
plug-in (for the Eclipse IDE), that notifies developers
of discrepancies between policy annotations and
as-written code. Our analysis technique uses several
underlying algorithms based on abstract interpretation,
call-graphs, and type inference. The resulting overall
analysis is both sound and composable. We have used
this prototype analysis tool in case studies to model
and analyze more than a million lines of code.\par
Our validation process included field trials on a wide
variety of complex large-scale production code selected
by the host organizations. Our in-field experience led
us to focus on potential adoptability by real-world
developers. We have developed techniques that can
reduce annotation density to less than one line per
thousand lines of code (KLOC). In addition, the
prototype analysis tool supports an incremental and
iterative approach to modeling and analysis. This
approach enabled field trial partners to directly
target areas of greatest concern and to achieve useful
results within a few hours.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "annotation; Java; state consistency;
multicore; race conditions; state confinement; thread
author = "Nathan R. Tallent and John M. Mellor-Crummey and Allan
title = "Analyzing lock contention in multithreaded
journal = j-SIGPLAN,
volume = "45",
number = "5",
pages = "269--280",
month = may,
year = "2010",
DOI = "https://doi.org/10.1145/1693453.1693489",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Aug 31 22:39:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Many programs exploit shared-memory parallelism using
multithreading. Threaded codes typically use locks to
coordinate access to shared data. In many cases,
contention for locks reduces parallel efficiency and
hurts scalability. Being able to quantify and attribute
lock contention is important for understanding where a
multithreaded program needs improvement.\par
This paper proposes and evaluates three strategies for
gaining insight into performance losses due to lock
contention. First, we consider using a straightforward
strategy based on call stack profiling to attribute
idle time and show that it fails to yield insight into
lock contention. Second, we consider an approach that
builds on a strategy previously used for analyzing
idleness in work-stealing computations; we show that
this strategy does not yield insight into lock
contention. Finally, we propose a new technique for
measurement and analysis of lock contention that uses
data associated with locks to blame lock holders for
the idleness of spinning threads. Our approach incurs $
\leq $ 5\% overhead on a quantum chemistry application
that makes extensive use of locking (65M distinct
locks, a maximum of 340K live locks, and an average of
30K lock acquisitions per second per thread) and
attributes lock contention to its full static and
dynamic calling contexts. Our strategy, implemented in
HPCToolkit, is fully distributed and should scale well
to systems with large core counts.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "HPCToolkit; lock contention; multithreading;
performance analysis",
author = "M. Tentyukov and J. A. M. Vermaseren",
title = "The multithreaded version of {FORM}",
journal = j-COMP-PHYS-COMM,
volume = "181",
number = "8",
pages = "1419--1427",
month = aug,
year = "2010",
DOI = "https://doi.org/10.1016/j.cpc.2010.04.009",
ISSN = "0010-4655 (print), 1879-2944 (electronic)",
ISSN-L = "0010-4655",
bibdate = "Sat Feb 11 09:54:30 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0010465510001207",
acknowledgement = ack-nhfb,
fjournal = "Computer Physics Communications",
journal-URL = "http://www.sciencedirect.com/science/journal/00104655",
author = "Chen Tian and Min Feng and Rajiv Gupta",
title = "Speculative parallelization using state separation and
multiple value prediction",
journal = j-SIGPLAN,
volume = "45",
number = "8",
pages = "63--72",
month = aug,
year = "2010",
DOI = "https://doi.org/10.1145/1806651.1806663",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Fri Oct 8 17:55:48 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "With the availability of chip multiprocessor (CMP) and
simultaneous multithreading (SMT) machines, extracting
thread level parallelism from a sequential program has
become crucial for improving performance. However, many
sequential programs cannot be easily parallelized due
to the presence of dependences. To solve this problem,
different solutions have been proposed. Some of them
make the optimistic assumption that such dependences
rarely manifest themselves at runtime. However, when
this assumption is violated, the recovery causes very
large overhead. Other approaches incur large
synchronization or computation overhead when resolving
the dependences. Consequently, for a loop with
frequently arising cross-iteration dependences,
previous techniques are not able to speed up the
execution. In this paper we propose a compiler
technique which uses state separation and multiple
value prediction to speculatively parallelize loops in
sequential programs that contain frequently arising
cross-iteration dependences. The key idea is to
generate multiple versions of a loop iteration based on
multiple predictions of values of variables involved in
cross-iteration dependences (i.e., live-in variables).
These speculative versions and the preceding loop
iteration are executed in separate memory states
simultaneously. After the execution, if one of these
versions is correct (i.e., its predicted values are
found to be correct), then we merge its state and the
state of the preceding iteration because the dependence
between the two iterations is correctly resolved. The
memory states of other incorrect versions are
completely discarded. Based on this idea, we further
propose a runtime adaptive scheme that not only gives a
good performance but also achieves better CPU
utilization. We conducted experiments on 10 benchmark
programs on a real machine. The results show that our
technique can achieve 1.7x speedup on average across
all used benchmarks.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "multicore processors; speculative parallelization",
author = "Emina Torlak and Mandana Vaziri and Julian Dolby",
title = "{MemSAT}: checking axiomatic specifications of memory
journal = j-SIGPLAN,
volume = "45",
number = "6",
pages = "341--350",
month = jun,
year = "2010",
DOI = "https://doi.org/10.1145/1806596.1806635",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Fri Oct 8 17:53:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Memory models are hard to reason about due to their
complexity, which stems from the need to strike a
balance between ease-of-programming and allowing
compiler and hardware optimizations. In this paper, we
present an automated tool, MemSAT, that helps in
debugging and reasoning about memory models. Given an
axiomatic specification of a memory model and a
multi-threaded test program containing assertions,
MemSAT outputs a trace of the program in which both the
assertions and the memory model axioms are satisfied,
if one can be found. The tool is fully automatic and is
based on a SAT solver. If it cannot find a trace, it
outputs a minimal subset of the memory model and
program constraints that are unsatisfiable. We used
MemSAT to check several existing memory models against
their published test cases, including the current Java
Memory Model by Manson et al. and a revised version of
it by Sevcik and Aspinall. We found subtle
discrepancies between what was expected and the actual
results of test programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "axiomatic specifications; bounded model checking;
memory models; sat",
author = "Oleg Trott and Arthur J. Olson",
title = "{AutoDock Vina}: {Improving} the speed and accuracy of
docking with a new scoring function, efficient
optimization, and multithreading",
journal = j-J-COMPUT-CHEM,
volume = "31",
number = "2",
pages = "455--461",
day = "30",
month = jan,
year = "2010",
DOI = "https://doi.org/10.1002/jcc.21334",
ISSN = "0192-8651 (print), 1096-987X (electronic)",
ISSN-L = "0192-8651",
bibdate = "Thu Nov 29 14:55:23 MST 2012",
bibsource = "http://www.interscience.wiley.com/jpages/0192-8651;
acknowledgement = ack-nhfb,
fjournal = "Journal of Computational Chemistry",
journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1096-987X",
onlinedate = "4 Jun 2009",
author = "Evangelos Vlachos and Michelle L. Goodstein and
Michael A. Kozuch and Shimin Chen and Babak Falsafi and
Phillip B. Gibbons and Todd C. Mowry",
title = "{ParaLog}: enabling and accelerating online parallel
monitoring of multithreaded applications",
journal = j-COMP-ARCH-NEWS,
volume = "38",
number = "1",
pages = "271--284",
month = mar,
year = "2010",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Wed Mar 17 14:42:04 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Evangelos Vlachos and Michelle L. Goodstein and
Michael A. Kozuch and Shimin Chen and Babak Falsafi and
Phillip B. Gibbons and Todd C. Mowry",
title = "{ParaLog}: enabling and accelerating online parallel
monitoring of multithreaded applications",
journal = j-SIGPLAN,
volume = "45",
number = "3",
pages = "271--284",
month = mar,
year = "2010",
DOI = "https://doi.org/10.1145/1736020.1736051",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Mar 17 13:46:56 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "{\em Instruction-grain lifeguards\/} monitor the
events of a running application at the level of
individual instructions in order to identify and help
mitigate application bugs and security exploits.
Because such lifeguards impose a 10--100X slowdown on
existing platforms, previous studies have proposed
hardware designs to accelerate lifeguard processing.
However, these accelerators are either tailored to a
specific class of lifeguards or suitable only for
monitoring single-threaded programs.\par
We present ParaLog, the first design of a system
enabling fast online parallel monitoring of
multithreaded parallel applications. ParaLog supports a
broad class of software-defined lifeguards. We show how
three existing accelerators can be enhanced to support
online multithreaded monitoring, dramatically reducing
lifeguard overheads. We identify and solve several
challenges in monitoring parallel applications and/or
parallelizing these accelerators, including (i)
enforcing inter-thread data dependences, (ii) dealing
with inter-thread effects that are not reflected in
coherence traffic, (iii) dealing with unmonitored
operating system activity, and (iv) ensuring lifeguards
can access shared metadata with negligible
synchronization overheads. We present our system design
for both Sequentially Consistent and Total Store
Ordering processors. We implement and evaluate our
design on a 16 core simulated CMP, using benchmarks
from SPLASH-2 and PARSEC and two lifeguards: a
data-flow tracking lifeguard and a memory-access
checker lifeguard. Our results show that (i) our
parallel accelerators improve performance by 2--9X and
1.13--3.4X for our two lifeguards, respectively, (ii) we
are 5--126X faster than the time-slicing approach
required by existing techniques, and (iii) our average
overheads for applications with eight threads are 51\%
and 28\% for the two lifeguards, respectively.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "hardware support for debugging; instruction-grain
lifeguards; online parallel monitoring",
author = "Peter H. Welch and Jan B. Pedersen",
title = "{Santa Claus}: {Formal} analysis of a process-oriented
journal = j-TOPLAS,
volume = "32",
number = "4",
pages = "14:1--14:37",
month = apr,
year = "2010",
DOI = "https://doi.org/10.1145/1734206.1734211",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Fri May 21 12:47:03 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
abstract = "With the commercial development of multicore
processors, the challenges of writing multithreaded
programs to take advantage of these new hardware
architectures are becoming more and more pertinent.
Concurrent programming is necessary to achieve the
performance that the hardware offers. Traditional
approaches present concurrency as an {\em advanced\/}
topic: they have proven difficult to use, reason about
with confidence, and scale up to high levels of
concurrency. This article reviews {\em process-oriented
design}, based on Hoare's algebra of Communicating
Sequential Processes (CSP), and proposes that this
approach to concurrency leads to solutions that are
manageable by novice programmers; that is, they are
easy to design and maintain, that they are scalable for
complexity, {\em obviously correct}, and relatively
easy to verify using formal reasoning and/or model
checkers. These solutions can be developed in
conventional programming languages (through CSP
libraries) or specialized ones (such as occam-$ \pi $) in a
manner that directly reflects their formal expression.
Systems can be developed without needing specialist
knowledge of the CSP formalism, since the supporting
mathematics is burnt into the tools and languages
supporting it. We illustrate these concepts with the
{\em Santa Claus problem}, which has been used as a
challenge for concurrency mechanisms since 1994. We
consider this problem as an example control system,
producing external signals reporting changes of
internal state (that model the external world). We
claim our occam-$ \pi $ solution is {\em
correct-by-design}, but follow this up with formal
verification (using the FDR model checker for CSP) that
the system is free from deadlock and livelock, that the
produced control signals obey crucial ordering
constraints, and that the system has key liveness
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Programming Languages and
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
keywords = "concurrency; CSP; deadlock; event ordering; liveness;
novice programmer; occam-pi; Process orientation;
author = "Piotr Wendykier and James G. Nagy",
title = "{Parallel Colt}: a High-Performance {Java} Library for
Scientific Computing and Image Processing",
journal = j-TOMS,
volume = "37",
number = "3",
pages = "31:1--31:22",
month = sep,
year = "2010",
DOI = "https://doi.org/10.1145/1824801.1824809",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Mon Sep 27 10:15:50 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Major breakthroughs in chip and software design have
been observed for the last nine years. In October 2001,
IBM released the world's first multicore processor:
POWER4. Six years later, in February 2007, NVIDIA made
a public release of CUDA SDK, a set of development
tools to write algorithms for execution on Graphic
Processing Units (GPUs). Although software vendors have
started working on parallelizing their products, the
vast majority of existing code is still sequential and
does not effectively utilize modern multicore CPUs and
manycore GPUs.\par
This article describes Parallel Colt, a multithreaded
Java library for scientific computing and image
processing. In addition to describing the design and
functionality of Parallel Colt, a comparison to MATLAB
is presented. Two ImageJ plugins for iterative image
deblurring and motion correction of PET brain images
are described as typical applications of this library.
Performance comparisons with MATLAB, including GPU
computations via AccelerEyes' Jacket toolbox are also
acknowledgement = ack-nhfb,
articleno = "31",
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
keywords = "Deconvolution; FFT; inverse problems; iterative
methods; motion correction; multithreading; PET;
author = "Kyle B. Wheeler and Douglas Thain",
title = "Visualizing massively multithreaded applications with
journal = j-CCPE,
volume = "22",
number = "1",
pages = "45--67",
month = jan,
year = "2010",
DOI = "https://doi.org/10.1002/cpe.1469",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Mon Dec 5 10:08:40 MST 2011",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Prac\-tice and
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "13 Aug 2009",
author = "Kyueun Yi and J.-L. Gaudiot",
title = "Network Applications on Simultaneous Multithreading
journal = j-IEEE-TRANS-COMPUT,
volume = "59",
number = "9",
pages = "1200--1209",
month = sep,
year = "2010",
DOI = "https://doi.org/10.1109/TC.2009.185",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Sun Jul 3 11:52:32 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5374374",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Eddy Z. Zhang and Yunlian Jiang and Xipeng Shen",
title = "Does cache sharing on modern {CMP} matter to the
performance of contemporary multithreaded programs?",
journal = j-SIGPLAN,
volume = "45",
number = "5",
pages = "203--212",
month = may,
year = "2010",
DOI = "https://doi.org/10.1145/1693453.1693482",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Aug 31 22:39:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Most modern Chip Multiprocessors (CMP) feature shared
cache on chip. For multithreaded applications, the
sharing reduces communication latency among co-running
threads, but also results in cache contention.\par
A number of studies have examined the influence of
cache sharing on multithreaded applications, but most
of them have concentrated on the design or management
of shared cache, rather than a systematic measurement
of the influence. Consequently, prior measurements have
been constrained by the reliance on simulators, the use
of out-of-date benchmarks, and the limited coverage of
deciding factors. The influence of CMP cache sharing on
contemporary multithreaded applications remains
preliminarily understood.\par
In this work, we conduct a systematic measurement of
the influence on two kinds of commodity CMP machines,
using a recently released CMP benchmark suite, PARSEC,
with a number of potentially important factors on
program, OS, and architecture levels considered. The
measurement shows some surprising results. Contrary to
commonly perceived importance of cache sharing, neither
positive nor negative effects from the cache sharing
are significant for most of the program executions,
regardless of the types of parallelism, input datasets,
architectures, numbers of threads, and assignments of
threads to cores. After a detailed analysis, we find
that the main reason is the mismatch of current
development and compilation of multithreaded
applications and CMP architectures. By transforming the
programs in a cache-sharing-aware manner, we observe up
to 36\% performance increase when the threads are
placed on cores appropriately.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "chip multiprocessors; parallel program optimizations;
shared cache; thread scheduling",
author = "Yao Zhang and Jonathan Cohen and John D. Owens",
title = "Fast tridiagonal solvers on the {GPU}",
journal = j-SIGPLAN,
volume = "45",
number = "5",
pages = "127--136",
month = may,
year = "2010",
DOI = "https://doi.org/10.1145/1693453.1693472",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Aug 31 22:39:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We study the performance of three parallel algorithms
and their hybrid variants for solving tridiagonal
linear systems on a GPU: cyclic reduction (CR),
parallel cyclic reduction (PCR) and recursive doubling
(RD). We develop an approach to measure, analyze, and
optimize the performance of GPU programs in terms of
memory access, computation, and control overhead. We
find that CR enjoys linear algorithm complexity but
suffers from more algorithmic steps and bank conflicts,
while PCR and RD have fewer algorithmic steps but do
more work each step. To combine the benefits of the
basic algorithms, we propose hybrid CR+PCR and CR+RD
algorithms, which improve the performance of PCR, RD
and CR by 21\%, 31\% and 61\% respectively. Our GPU
solvers achieve up to a $ 28 \times $ speedup over a sequential
LAPACK solver, and a $ 12 \times $ speedup over a multi-threaded
CPU solver.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "GPGPU; performance optimization; tridiagonal linear
author = "David A. Zier and Ben Lee",
title = "Performance Evaluation of Dynamic Speculative
Multithreading with the {Cascadia} Architecture",
volume = "21",
number = "1",
pages = "47--59",
month = jan,
year = "2010",
DOI = "https://doi.org/10.1109/TPDS.2009.47",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu May 13 12:06:56 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Chandrajit L. Bajaj and Rezaul Chowdhury and Vinay
title = "{$ F^2 $Dock}: Fast {Fourier} Protein-Protein
journal = j-TCBB,
volume = "8",
number = "1",
pages = "45--58",
month = jan,
year = "2011",
DOI = "https://doi.org/10.1109/TCBB.2009.57",
ISSN = "1545-5963 (print), 1557-9964 (electronic)",
ISSN-L = "1545-5963",
bibdate = "Mon Dec 20 18:39:04 MST 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The functions of proteins are often realized through
their mutual interactions. Determining a relative
transformation for a pair of proteins and their
conformations which form a stable complex, reproducible
in nature, is known as docking. It is an important step
in drug design, structure determination, and
understanding function and structure relationships. In
this paper, we extend our nonuniform fast Fourier
transform-based docking algorithm to include an
adaptive search phase (both translational and
rotational) and thereby speed up its execution. We have
also implemented a multithreaded version of the
adaptive docking algorithm for even faster execution on
multicore machines. We call this protein-protein
docking code {\rm F}^2Dock (F^2= {\rm
acknowledgement = ack-nhfb,
fjournal = "IEEE/ACM Transactions on Computational Biology and
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J954",
author = "Thomas Ball and Sebastian Burckhardt and Peli de
Halleux and Madan Musuvathi and Shaz Qadeer",
title = "Predictable and Progressive Testing of Multithreaded
journal = j-IEEE-SOFTWARE,
volume = "28",
number = "3",
pages = "75--83",
month = may # "\slash " # jun,
year = "2011",
DOI = "https://doi.org/10.1109/MS.2010.64",
ISSN = "0740-7459 (print), 1937-4194 (electronic)",
ISSN-L = "0740-7459",
bibdate = "Thu Apr 28 08:41:06 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Software",
journal-URL = "http://www.computer.org/portal/web/csdl/magazines/software",
author = "Paolo Bientinesi and Francisco D. Igual and Daniel
Kressner and Matthias Petschow and Enrique S.
title = "Condensed forms for the symmetric eigenvalue problem
on multi-threaded architectures",
journal = j-CCPE,
volume = "23",
number = "7",
pages = "694--707",
month = may,
year = "2011",
DOI = "https://doi.org/10.1002/cpe.1680",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Mon Dec 5 10:08:55 MST 2011",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Practice and Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "8 Nov 2010",
author = "Jacob Burnim and George Necula and Koushik Sen",
title = "Specifying and checking semantic atomicity for
multithreaded programs",
journal = j-COMP-ARCH-NEWS,
volume = "39",
number = "1",
pages = "79--90",
month = mar,
year = "2011",
DOI = "https://doi.org/10.1145/1961295.1950377",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Thu Aug 18 13:45:25 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Jacob Burnim and George Necula and Koushik Sen",
title = "Specifying and checking semantic atomicity for
multithreaded programs",
journal = j-SIGPLAN,
volume = "46",
number = "3",
pages = "79--90",
month = mar,
year = "2011",
DOI = "https://doi.org/10.1145/1961296.1950377",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue May 24 10:55:08 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ASPLOS '11 conference proceedings",
author = "Michael Butler and Leslie Barnes and Debjit Das Sarma
and Bob Gelinas",
title = "{Bulldozer}: An Approach to Multithreaded Compute
journal = j-IEEE-MICRO,
volume = "31",
number = "2",
pages = "6--15",
month = mar # "\slash " # apr,
year = "2011",
DOI = "https://doi.org/10.1109/MM.2011.23",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Tue Apr 26 13:50:28 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "AMD's Bulldozer module represents a new direction in
microarchitecture and includes a number of firsts for
AMD, including AMD's first multithreaded x86 processor,
first implementation of a shared Level 2 cache, and
first x86 processor to incorporate floating-point
multiply-accumulate (FMAC). This article discusses the
module's multithreading architecture, power-efficient
microarchitecture, and subblocks, including the various
microarchitectural latencies, bandwidths, and structure
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
keywords = "Hot Chips 22 conference proceedings",
author = "Kuo-Yi Chen and J. Morris Chang and Ting-Wei Hou",
title = "Multithreading in {Java}: Performance and Scalability
on Multicore Systems",
journal = j-IEEE-TRANS-COMPUT,
volume = "60",
number = "11",
pages = "1521--1534",
month = nov,
year = "2011",
DOI = "https://doi.org/10.1109/TC.2010.232",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Tue Sep 27 07:57:50 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5661769",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Gautham N. Chinya and Jamison D. Collins and Perry H.
Wang and Hong Jiang and Guei-Yuan Lueh and Thomas A.
Piazza and Hong Wang",
title = "{Bothnia}: a dual-personality extension to the {Intel}
integrated graphics driver",
journal = j-OPER-SYS-REV,
volume = "45",
number = "1",
pages = "11--20",
month = jan,
year = "2011",
DOI = "https://doi.org/10.1145/1945023.1945027",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Fri Feb 25 16:43:23 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In this paper, we introduce Bothnia, an extension to
the Intel production graphics driver to support a
shared virtual memory heterogeneous multithreading
programming model. With Bothnia, the Intel graphics
device driver can support both the traditional 3D
graphics rendering software stack and a new class of
heterogeneous multithreaded applications, which can use
both IA (Intel Architecture) CPU cores and Intel
integrated Graphics and Media Accelerator (GMA) cores
in the same virtual address space. We describe the
necessary architectural supports in both IA CPU and the
GMA cores and present a reference Bothnia
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
author = "Timothy A. Davis",
title = "{Algorithm 915}, {SuiteSparseQR}: {Multifrontal}
multithreaded rank-revealing sparse {QR}
journal = j-TOMS,
volume = "38",
number = "1",
pages = "8:1--8:22",
month = nov,
year = "2011",
DOI = "https://doi.org/10.1145/2049662.2049670",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Thu Dec 15 08:59:34 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "SuiteSparseQR is a sparse QR factorization package
based on the multifrontal method. Within each frontal
matrix, LAPACK and the multithreaded BLAS enable the
method to obtain high performance on multicore
architectures. Parallelism across different frontal
matrices is handled with Intel's Threading Building
Blocks library. The symbolic analysis and ordering
phase pre-eliminates singletons by permuting the input
matrix $A$ into the form $ [R_{11} R_{12}; 0 A_{22}] $ where $ R_{11} $ is
upper triangular with diagonal entries above a given
tolerance. Next, the fill-reducing ordering, column
elimination tree, and frontal matrix structures are
found without requiring the formation of the pattern of
$ A^T A $. Approximate rank-detection is performed within
each frontal matrix using Heath's method.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
author = "Javier Esparza and Pierre Ganty",
title = "Complexity of pattern-based verification for
multithreaded programs",
journal = j-SIGPLAN,
volume = "46",
number = "1",
pages = "499--510",
month = jan,
year = "2011",
DOI = "https://doi.org/10.1145/1925844.1926443",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jan 26 15:06:39 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Frank Feinbube and Peter Tr{\"o}ger and Andreas Polze",
title = "Joint Forces: From Multithreaded Programming to {GPU}
journal = j-IEEE-SOFTWARE,
volume = "28",
number = "1",
pages = "51--57",
month = jan # "\slash " # feb,
year = "2011",
DOI = "https://doi.org/10.1109/MS.2010.134",
ISSN = "0740-7459 (print), 1937-4194 (electronic)",
ISSN-L = "0740-7459",
bibdate = "Thu Dec 23 16:29:15 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Software",
journal-URL = "http://www.computer.org/portal/web/csdl/magazines/software",
author = "Karthik Ganesan and Lizy K. John",
title = "{MAximum Multicore POwer (MAMPO)}: an automatic
multithreaded synthetic power virus generation
framework for multicore systems",
crossref = "Lathrop:2011:SPI",
pages = "53:1--53:12",
year = "2011",
DOI = "https://doi.org/10.1145/2063384.2063455",
bibdate = "Fri Dec 16 11:05:47 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
articleno = "53",
author = "Ashutosh Gupta and Corneliu Popeea and Andrey
title = "Predicate abstraction and refinement for verifying
multi-threaded programs",
journal = j-SIGPLAN,
volume = "46",
number = "1",
pages = "331--344",
month = jan,
year = "2011",
DOI = "https://doi.org/10.1145/1925844.1926424",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jan 26 15:06:39 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Bo Hong and Zhengyu He",
title = "An Asynchronous Multithreaded Algorithm for the
Maximum Network Flow Problem with Nonblocking Global
Relabeling Heuristic",
volume = "22",
number = "6",
pages = "1025--1033",
month = jun,
year = "2011",
DOI = "https://doi.org/10.1109/TPDS.2010.156",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Fri Jul 22 07:53:43 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Chia-Jui Hsu and Jos{\'e} Luis Pino and Shuvra S.
title = "Multithreaded Simulation for Synchronous Dataflow
journal = j-TODAES,
volume = "16",
number = "3",
pages = "25:1--25:??",
month = jun,
year = "2011",
DOI = "https://doi.org/10.1145/1970353.1970358",
ISSN = "1084-4309 (print), 1557-7309 (electronic)",
ISSN-L = "1084-4309",
bibdate = "Tue Jun 14 11:55:50 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/todaes/;
abstract = "For system simulation, Synchronous DataFlow (SDF) has
been widely used as a core model of computation in
design tools for digital communication and signal
processing systems. The traditional approach for
simulating SDF graphs is to compute and execute static
schedules in single-processor desktop environments.
Nowadays, however, multicore processors are
increasingly popular desktop platforms for their
potential performance improvements through thread-level
parallelism. Without novel scheduling and simulation
techniques that explicitly explore thread-level
parallelism for executing SDF graphs, current design
tools gain only minimal performance improvements on
multicore platforms. In this article, we present a new
multithreaded simulation scheduler, called MSS, to
provide simulation runtime speedup for executing SDF
graphs on multicore processors.",
acknowledgement = ack-nhfb,
articleno = "25",
fjournal = "ACM Transactions on Design Automation of Electronic
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776",
author = "Dennis Jeffrey and Yan Wang and Chen Tian and Rajiv
title = "Isolating bugs in multithreaded programs using
execution suppression",
journal = j-SPE,
volume = "41",
number = "11",
pages = "1259--1288",
month = oct,
year = "2011",
DOI = "https://doi.org/10.1002/spe.1040",
ISSN = "0038-0644 (print), 1097-024X (electronic)",
ISSN-L = "0038-0644",
bibdate = "Thu Sep 29 14:49:13 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Software --- Practice and Experience",
journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
onlinedate = "18 Jan 2011",
author = "Pramod G. Joisha and Robert S. Schreiber and
Prithviraj Banerjee and Hans J. Boehm and Dhruva R.
title = "A technique for the effective and automatic reuse of
classical compiler optimizations on multithreaded
journal = j-SIGPLAN,
volume = "46",
number = "1",
pages = "623--636",
month = jan,
year = "2011",
DOI = "https://doi.org/10.1145/1925844.1926457",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Jan 26 15:06:39 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Guodong Li and Robert Palmer and Michael DeLisi and
Ganesh Gopalakrishnan and Robert M. Kirby",
title = "Formal specification of {MPI 2.0}: {Case} study in
specifying a practical concurrent programming {API}",
volume = "76",
number = "2",
pages = "65--81",
day = "1",
month = feb,
year = "2011",
ISSN = "0167-6423 (print), 1872-7964 (electronic)",
ISSN-L = "0167-6423",
bibdate = "Fri Apr 1 18:39:40 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "Science of Computer Programming",
journal-URL = "http://www.sciencedirect.com/science/journal/01676423/",
author = "Sheng Li and Shannon Kuntz and Jay B. Brockman and
Peter M. Kogge",
title = "{Lightweight Chip Multi-Threading (LCMT)}: Maximizing
Fine-Grained Parallelism On-Chip",
volume = "22",
number = "7",
pages = "1178--1191",
month = jul,
year = "2011",
DOI = "https://doi.org/10.1109/TPDS.2010.169",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Fri Jul 22 07:54:38 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Xiongfei Liao and Thambipillai Srikanthan",
title = "Accelerating {UNISIM}-Based Cycle-Level
Microarchitectural Simulations on Multicore Platforms",
journal = j-TODAES,
volume = "16",
number = "3",
pages = "26:1--26:??",
month = jun,
year = "2011",
DOI = "https://doi.org/10.1145/1970353.1970359",
ISSN = "1084-4309 (print), 1557-7309 (electronic)",
ISSN-L = "1084-4309",
bibdate = "Tue Jun 14 11:55:50 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/todaes/;
abstract = "UNISIM has been shown to ease the development of
simulators for multi-/many-core systems. However,
UNISIM cycle-level simulations of large-scale
multiprocessor systems could be very time consuming. In
this article, we propose a systematic framework for
accelerating UNISIM cycle-level simulations on
multicore platforms. The proposed framework relies on
exploiting the fine-grained parallelism within the
simulated cycles using POSIX threads. A multithreaded
simulation engine has been devised from the
single-threaded UNISIM SystemC engine to facilitate the
exploitation of inherent parallelism. An adaptive
technique that manages the overall computation workload
by adjusting the number of threads employed at any
given time is proposed. In addition, we have introduced
a technique to balance the workloads of multithreaded
acknowledgement = ack-nhfb,
articleno = "26",
fjournal = "ACM Transactions on Design Automation of Electronic
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776",
author = "Kai Ma and Xue Li and Ming Chen and Xiaorui Wang",
title = "Scalable power control for many-core architectures
running multi-threaded applications",
journal = j-COMP-ARCH-NEWS,
volume = "39",
number = "3",
pages = "449--460",
month = jun,
year = "2011",
DOI = "https://doi.org/10.1145/2024723.2000117",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Mon Sep 5 17:15:11 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Basel A. Mahafzah",
title = "Parallel multithreaded {IDA*} heuristic search:
algorithm design and performance evaluation",
journal = j-INT-J-PAR-EMER-DIST-SYS,
volume = "26",
number = "1",
pages = "61--82",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1080/17445761003604521",
ISSN = "1744-5760 (print), 1744-5779 (electronic)",
ISSN-L = "1744-5760",
bibdate = "Mon Sep 5 20:33:09 MDT 2011",
bibsource = "http://www.informaworld.com/smpp/title~content=t713729127~link=cover;
acknowledgement = ack-nhfb,
journal-URL = "http://www.tandfonline.com/loi/gpaa20",
onlinedate = "6 Dec 2010",
author = "Daniel Marino and Abhayendra Singh and Todd Millstein
and Madanlal Musuvathi and Satish Narayanasamy",
title = "A case for an {SC}-preserving compiler",
journal = j-SIGPLAN,
volume = "46",
number = "6",
pages = "199--210",
month = jun,
year = "2011",
DOI = "https://doi.org/10.1145/1993316.1993522",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Thu Jun 9 10:23:33 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The most intuitive memory consistency model for
shared-memory multi-threaded programming is sequential
consistency (SC). However, current concurrent
programming languages support a relaxed model, as such
relaxations are deemed necessary for enabling important
optimizations. This paper demonstrates that an
SC-preserving compiler, one that ensures that every SC
behavior of a compiler-generated binary is an SC
behavior of the source program, retains most of the
performance benefits of an optimizing compiler. The key
observation is that a large class of optimizations
crucial for performance are either already
SC-preserving or can be modified to preserve SC while
retaining much of their effectiveness. An SC-preserving
compiler, obtained by restricting the optimization
phases in LLVM, a state-of-the-art C/C++ compiler,
incurs an average slowdown of 3.8\% and a maximum
slowdown of 34\% on a set of 30 programs from the
SPLASH-2, PARSEC, and SPEC CINT2006 benchmark
While the performance overhead of preserving SC in the
compiler is much less than previously assumed, it might
still be unacceptable for certain applications. We
believe there are several avenues for improving
performance without giving up SC-preservation. In this
vein, we observe that the overhead of our SC-preserving
compiler arises mainly from its inability to
aggressively perform a class of optimizations we
identify as eager-load optimizations. This class
includes common-subexpression elimination, constant
propagation, global value numbering, and common cases
of loop-invariant code motion. We propose a notion of
interference checks in order to enable eager-load
optimizations while preserving SC. Interference checks
expose to the compiler a commonly used hardware
speculation mechanism that can efficiently detect
whether a particular variable has changed its value
since last read.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
keywords = "LLVM compiler suite; sequential consistency (SC)",
author = "Robert Preissl and Nathan Wichmann and Bill Long and
John Shalf and Stephane Ethier and Alice Koniges",
title = "Multithreaded Global Address Space Communication
Techniques for Gyrokinetic Fusion Applications on
Ultra-Scale Platforms",
crossref = "Lathrop:2011:SPI",
pages = "12:1--12:11",
year = "2011",
DOI = "https://doi.org/10.1145/2063384.2071033",
bibdate = "Fri Dec 16 11:05:47 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
author = "Pablo Prieto and Valentin Puente and Jose-Angel
title = "Multilevel Cache Modeling for Chip-Multiprocessor
volume = "10",
number = "2",
pages = "49--52",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.20",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "This paper presents a simple analytical model for
predicting on-chip cache hierarchy effectiveness in
chip multiprocessors (CMP) for a state-of-the-art
architecture. Given the complexity of this type of
systems, we use rough approximations, such as the
empirical observation that the re-reference timing
pattern follows a power law and the assumption of a
simplistic delay model for the cache, in order to
provide a useful model for the memory hierarchy
responsiveness. This model enables the analytical
determination of average access time, which makes
design space pruning useful before sweeping the vast
design space of this class of systems. The model is
also useful for predicting cache hierarchy behavior in
future systems. The fidelity of the model has been
validated using a state-of-the-art, full-system
simulation environment, on a system with up to sixteen
out-of-order processors with cache-coherent caches and
using a broad spectrum of applications, including
complex multithread workloads. This simple model can
predict a near-to-optimal, on-chip cache distribution
while also estimating how future systems running future
applications might behave.",
acknowledgement = ack-nhfb,
affiliation = "Prieto, P (Reprint Author), Univ Cantabria, Cantabria,
Spain. Prieto, Pablo; Puente, Valentin; Gregorio,
Jose-Angel, Univ Cantabria, Cantabria, Spain.",
author-email = "prietop@unican.es vpuente@unican.es
da = "2019-06-20",
doc-delivery-number = "855NW",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Spanish Ministry of Science and Innovation
[TIN2010-18159]; HiPEAC2 European Network of
funding-text = "This work has been supported by the Spanish Ministry
of Science and Innovation, under contracts
TIN2010-18159, and by the HiPEAC2 European Network of
Excellence. The authors would like to thank the
reviewers for their valuable comments.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
number-of-cited-references = "13",
ORCID-numbers = "Prieto, Pablo/0000-0002-5818-1188 Puente,
Valentin/0000-0002-6904-3282 Gregorio, Jose
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Prieto:2011:MCM",
web-of-science-categories = "Computer Science, Hardware \&
author = "Dheeraj Reddy and David Koufaty and Paul Brett and
Scott Hahn",
title = "Bridging functional heterogeneity in multicore
journal = j-OPER-SYS-REV,
volume = "45",
number = "1",
pages = "21--33",
month = jan,
year = "2011",
DOI = "https://doi.org/10.1145/1945023.1945028",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Fri Feb 25 16:43:23 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Heterogeneous processors that mix big high performance
cores with small low power cores promise excellent
single-threaded performance coupled with high
multi-threaded throughput and higher
performance-per-watt. A significant portion of the
commercial multicore heterogeneous processors are
likely to have a common instruction set architecture
(ISA). However, due to limited design resources and
goals, each core is likely to contain ISA extensions
not yet implemented in the other core. Therefore, such
heterogeneous processors will have inherent functional
asymmetry at the ISA level and face significant
software challenges. This paper analyzes the software
challenges to the operating system and the application
layer software on a heterogeneous system with
functional asymmetry, where the ISA of the small and
big cores overlaps.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
author = "Soumyaroop Roy and Nagarajan Ranganathan and Srinivas
title = "State-Retentive Power Gating of Register Files in
Multicore Processors Featuring Multithreaded In-Order
journal = j-IEEE-TRANS-COMPUT,
volume = "60",
number = "11",
pages = "1547--1560",
month = nov,
year = "2011",
DOI = "https://doi.org/10.1109/TC.2010.249",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Tue Sep 27 07:57:50 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5669257",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "M. Sch{\"o}nherr and K. Kucher and M. Geier and M.
Stiebler and S. Freudiger and M. Krafczyk",
title = "Multi-thread implementations of the lattice
{Boltzmann} method on non-uniform grids for {CPUs} and
journal = j-COMPUT-MATH-APPL,
volume = "61",
number = "12",
pages = "3730--3743",
month = jun,
year = "2011",
ISSN = "0898-1221 (print), 1873-7668 (electronic)",
ISSN-L = "0898-1221",
bibdate = "Wed Mar 1 21:50:48 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/computmathappl2010.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0898122111002999",
acknowledgement = ack-nhfb,
fjournal = "Computers and Mathematics with Applications",
journal-URL = "http://www.sciencedirect.com/science/journal/08981221",
author = "Xuping Tu and Hai Jin and Zhibin Yu and Jie Chen and
Yabin Hu and Xie Xia",
title = "{MT-BTRIMER}: A master-slave multi-threaded dynamic
binary translator",
volume = "26",
number = "5",
pages = "??--??",
month = sep,
year = "2011",
ISSN = "0267-6192",
ISSN-L = "0267-6192",
bibdate = "Tue Dec 3 12:04:33 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/computsystscieng.bib;
acknowledgement = ack-nhfb,
fjournal = "International Journal of Computer Systems Science and
author = "Robert A. {Van De Geijn} and Field G. {Van Zee}",
title = "High-performance up-and-downdating via
{Householder}-like transformations",
journal = j-TOMS,
volume = "38",
number = "1",
pages = "4:1--4:17",
month = nov,
year = "2011",
DOI = "https://doi.org/10.1145/2049662.2049666",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Thu Dec 15 08:59:34 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "We present high-performance algorithms for
up-and-downdating a Cholesky factor or QR
factorization. The method uses Householder-like
transformations, sometimes called hyperbolic
Householder transformations, that are accumulated so
that most computation can be cast in terms of
high-performance matrix-matrix operations. The
resulting algorithms can then be used as building
blocks for an algorithm-by-blocks that allows
computation to be conveniently scheduled to
multithreaded architectures like multicore processors.
Performance is shown to be similar to that achieved by
a blocked QR factorization via Householder
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
author = "Hans Vandierendonck and Andr{\'e} Seznec",
title = "Fairness Metrics for Multi-Threaded Processors",
volume = "10",
number = "1",
pages = "4--7",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Multi-threaded processors execute multiple threads
concurrently in order to increase overall throughput.
It is well documented that multi-threading affects
per-thread performance but, more importantly, some
threads are affected more than others. This is
especially troublesome for multi-programmed workloads.
Fairness metrics measure whether all threads are
affected equally. However defining equal treatment is
not straightforward. Several fairness metrics for
multi-threaded processors have been utilized in the
literature, although there does not seem to be a
consensus on what metric does the best job of measuring
fairness. This paper reviews the prevalent fairness
metrics and analyzes their main properties. Each metric
strikes a different trade-off between fairness in the
strict sense and throughput. We categorize the metrics
with respect to this property. Based on experimental
data for SMT processors, we suggest using the minimum
fairness metric in order to balance fairness and
acknowledgement = ack-nhfb,
affiliation = "Vandierendonck, H (Reprint Author), Univ Ghent, Dept
Elect \& Informat Syst, Ghent, Belgium. Vandierendonck,
Hans, Univ Ghent, Dept Elect \& Informat Syst, Ghent,
Belgium. Seznec, Andre, INRIA Rennes, Rennes, France.",
author-email = "hans.vandierendonck@elis.ugent.be
da = "2019-06-20",
doc-delivery-number = "773ZN",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "fairness; measurement; multi-programming;
Multi-threaded processors; quality-of-service",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "13",
unique-id = "Vandierendonck:2011:FMM",
web-of-science-categories = "Computer Science, Hardware \&
author = "Hans Vandierendonck and Andr{\'e} Seznec",
title = "Managing {SMT} resource usage through speculative
instruction window weighting",
journal = j-TACO,
volume = "8",
number = "3",
pages = "12:1--12:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019611",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Simultaneous multithreading processors dynamically
share processor resources between multiple threads. In
general, shared SMT resources may be managed
explicitly, for instance, by dynamically setting queue
occupation bounds for each thread as in the DCRA and
Hill-Climbing policies. Alternatively, resources may be
managed implicitly; that is, resource usage is
controlled by placing the desired instruction mix in
the resources. In this case, the main resource
management tool is the instruction fetch policy which
must predict the behavior of each thread (branch
mispredictions, long-latency loads, etc.) as it fetches
instructions. In this article, we present the use of
Speculative Instruction Window Weighting (SIWW) to
bridge the gap between implicit and explicit SMT fetch
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Wing-kei S. Yu and Ruirui Huang and Sarah Q. Xu and
Sung-En Wang and Edwin Kan and G. Edward Suh",
title = "{SRAM--DRAM} hybrid memory with applications to
efficient register files in fine-grained
journal = j-COMP-ARCH-NEWS,
volume = "39",
number = "3",
pages = "247--258",
month = jun,
year = "2011",
DOI = "https://doi.org/10.1145/2024723.2000094",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Mon Sep 5 17:15:11 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Qin Zhao and David Koh and Syed Raza and Derek
Bruening and Weng-Fai Wong and Saman Amarasinghe",
title = "Dynamic cache contention detection in multi-threaded
journal = j-SIGPLAN,
volume = "46",
number = "7",
pages = "27--38",
month = jul,
year = "2011",
DOI = "https://doi.org/10.1145/2007477.1952688",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Fri Sep 16 10:02:34 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "David (Yu) Zhu and Jaeyeon Jung and Dawn Song and
Tadayoshi Kohno and David Wetherall",
title = "{TaintEraser}: protecting sensitive data leaks using
application-level taint tracking",
journal = j-OPER-SYS-REV,
volume = "45",
number = "1",
pages = "142--154",
month = jan,
year = "2011",
DOI = "https://doi.org/10.1145/1945023.1945039",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Fri Feb 25 16:43:23 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We present TaintEraser, a new tool that tracks the
movement of sensitive user data as it flows through
off-the-shelf applications. TaintEraser uses
application-level dynamic taint analysis to let users
run applications in their own environment while
preventing unwanted information exposure. It is made
possible by techniques we developed for accurate and
efficient tainting: (1) Semantic-aware
instruction-level tainting is critical to track taint
accurately, without explosion or loss. (2) Function
summaries provide an interface to handle taint
propagation within the kernel and reduce the overhead
of instruction-level tracking. (3) On-demand
instrumentation enables fast loading of large
applications. Together, these techniques let us analyze
large, multi-threaded, networked applications in near
acknowledgement = ack-nhfb,
fjournal = "ACM SIGOPS Operating Systems Review",
author = "Xiaotong Zhuang and Santosh Pande",
title = "Compiler-Supported Thread Management for Multithreaded
Network Processors",
journal = j-TECS,
volume = "10",
number = "4",
pages = "44:1--44:??",
month = nov,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043662.2043668",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Mon Dec 19 15:49:06 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Traditionally, runtime management involving CPU
sharing, real-time scheduling, etc., is provided by the
runtime environment (typically an operating system)
using hardware support such as timers and interrupts.
However, due to stringent performance requirements on
network processors, neither OS nor hardware mechanisms
are typically feasible/available. Mapping packet
processing tasks on network processors involves complex
trade-offs to maximize parallelism and pipelining. Due
to an increase in the size of the code store and
complexity of application requirements, network
processors are being programmed with heterogeneous
threads that may execute code belonging to different
tasks on a given micro-engine. Also, most network
applications are streaming applications that are
typically processed in a pipelined fashion.",
acknowledgement = ack-nhfb,
articleno = "44",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840",
author = "Jung Ho Ahn and Norman P. Jouppi and Christos
Kozyrakis and Jacob Leverich and Robert S. Schreiber",
title = "Improving System Energy Efficiency with Memory Rank
journal = j-TACO,
volume = "9",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2133382.2133386",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 30 17:45:35 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "VLSI process technology scaling has enabled dramatic
improvements in the capacity and peak bandwidth of DRAM
devices. However, current standard DDR x DIMM memory
interfaces are not well tailored to achieve high energy
efficiency and performance in modern
chip-multiprocessor-based computer systems. Their
suboptimal performance and energy inefficiency can have
a significant impact on system-wide efficiency since
much of the system power dissipation is due to memory
power. New memory interfaces, better suited for future
many-core systems, are needed. In response, there are
recent proposals to enhance the energy efficiency of
main-memory systems by dividing a memory rank into
subsets, and making a subset rather than a whole rank
serve a memory request. We holistically assess the
effectiveness of rank subsetting from system-wide
performance, energy-efficiency, and reliability
perspectives. We identify the impact of rank subsetting
on memory power and processor performance analytically,
compare two promising rank-subsetting proposals,
Multicore DIMM and mini-rank, and verify our analysis
by simulating a chip-multiprocessor system using
multithreaded and consolidated workloads. We extend the
design of Multicore DIMM for high-reliability systems
and show that compared with conventional chipkill
approaches, rank subsetting can lead to much higher
system-level energy efficiency and performance at the
cost of additional DRAM devices. This holistic
assessment shows that rank subsetting offers compelling
alternatives to existing processor-memory interfaces
for future DDR systems.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Jos{\'e} I. Aliaga and Paolo Bientinesi and Davor
Davidovi{\'c} and Edoardo {Di Napoli} and Francisco D.
Igual and Enrique S. Quintana-Ort{\'\i}",
title = "Solving dense generalized eigenproblems on
multi-threaded architectures",
journal = j-APPL-MATH-COMP,
volume = "218",
number = "22",
pages = "11279--11289",
day = "15",
month = jul,
year = "2012",
DOI = "https://doi.org/10.1016/j.amc.2012.05.020",
ISSN = "0096-3003 (print), 1873-5649 (electronic)",
ISSN-L = "0096-3003",
bibdate = "Mon Jun 25 12:18:46 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/applmathcomput2010.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S009630031200505X",
acknowledgement = ack-nhfb,
fjournal = "Applied Mathematics and Computation",
journal-URL = "http://www.sciencedirect.com/science/journal/00963003",
author = "Jos{\'e}-Mar{\'\i}a Arnau and Joan-Manuel Parcerisa
and Polychronis Xekalakis",
title = "Boosting mobile {GPU} performance with a decoupled
access\slash execute fragment processor",
journal = j-COMP-ARCH-NEWS,
volume = "40",
number = "3",
pages = "84--93",
month = jun,
year = "2012",
DOI = "https://doi.org/10.1145/2366231.2337169",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Thu Sep 6 10:21:07 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "ISCA '12 conference proceedings.",
abstract = "Smartphones represent one of the fastest growing
markets, providing significant hardware/software
improvements every few months. However, supporting
these capabilities reduces the operating time per
battery charge. The CPU/GPU component is only left with
a shrinking fraction of the power budget, since most of
the energy is consumed by the screen and the antenna.
In this paper, we focus on improving the energy
efficiency of the GPU since graphical applications
consist an important part of the existing market.
Moreover, the trend towards better screens will
inevitably lead to a higher demand for improved
graphics rendering. We show that the main bottleneck
for these applications is the texture cache and that
traditional techniques for hiding memory latency
(prefetching, multithreading) do not work well or come
at a high energy cost. We thus propose the migration of
GPU designs towards the decoupled access-execute
concept. Furthermore, we significantly reduce bandwidth
usage in the decoupled architecture by exploiting
inter-core data sharing. Using commercial Android
applications, we show that the end design can achieve
93\% of the performance of a heavily multithreaded GPU
while providing energy savings of 34\%.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Sara S. Baghsorkhi and Isaac Gelado and Matthieu
Delahaye and Wen-mei W. Hwu",
title = "Efficient performance evaluation of memory hierarchy
for highly multithreaded graphics processors",
journal = j-SIGPLAN,
volume = "47",
number = "8",
pages = "23--34",
month = aug,
year = "2012",
DOI = "https://doi.org/10.1145/2370036.2145820",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Sep 12 12:11:57 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "PPOPP '12 conference proceedings.",
abstract = "With the emergence of highly multithreaded
architectures, performance monitoring techniques face
new challenges in efficiently locating sources of
performance discrepancies in the program source code.
For example, the state-of-the-art performance counters
in highly multithreaded graphics processing units
(GPUs) report only the overall occurrences of
microarchitecture events at the end of program
execution. Furthermore, even if supported, any
fine-grained sampling of performance counters will
distort the actual program behavior and will make the
sampled values inaccurate. On the other hand, it is
difficult to achieve high resolution performance
information at low sampling rates in the presence of
thousands of concurrently running threads. In this
paper, we present a novel software-based approach for
monitoring the memory hierarchy performance in highly
multithreaded general-purpose graphics processors. The
proposed analysis is based on memory traces collected
for snapshots of an application execution. A
trace-based memory hierarchy model with a Monte Carlo
experimental methodology generates statistical bounds
of performance measures without being concerned about
the exact inter-thread ordering of individual events
but rather studying the behavior of the overall system.
The statistical approach overcomes the classical
problem of disturbed execution timing due to
fine-grained instrumentation. The approach scales well
as we deploy an efficient parallel trace collection
technique to reduce the trace generation overhead and a
simple memory hierarchy model to reduce the simulation
time. The proposed scheme also keeps track of
individual memory operations in the source code and can
quantify their efficiency with respect to the memory
system. A cross-validation of our results shows close
agreement with the values read from the hardware
performance counters on an NVIDIA Tesla C2050 GPU.
Based on the high resolution profile data produced by
our model we optimized memory accesses in the sparse
matrix vector multiply kernel and achieved speedups
ranging from 2.4 to 14.8 depending on the
characteristics of the input matrices.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Ahmed Bouajjani and Michael Emmi",
title = "Analysis of recursively parallel programs",
journal = j-SIGPLAN,
volume = "47",
number = "1",
pages = "203--214",
month = jan,
year = "2012",
DOI = "https://doi.org/10.1145/2103621.2103681",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Thu Mar 15 18:16:55 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We propose a general formal model of isolated
hierarchical parallel computations, and identify
several fragments to match the concurrency constructs
present in real-world programming languages such as
Cilk and X10. By associating fundamental formal models
(vector addition systems with recursive transitions) to
each fragment, we provide a common platform for
exposing the relative difficulties of algorithmic
reasoning. For each case we measure the complexity of
deciding state-reachability for finite-data recursive
programs, and propose algorithms for the decidable
cases. The complexities which include PTIME, NP,
EXPSPACE, and 2EXPTIME contrast with undecidable
state-reachability for recursive multi-threaded
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "POPL '12 conference proceedings.",
author = "David Burgess and Edmund Gieske and James Holt and
Thomas Hoy and Gary Whisenhunt",
title = "{e6500}: {Freescale}'s Low-Power, High-Performance
Multithreaded Embedded Processor",
journal = j-IEEE-MICRO,
volume = "32",
number = "5",
pages = "26--36",
month = sep # "\slash " # oct,
year = "2012",
DOI = "https://doi.org/10.1109/MM.2012.55",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Thu Nov 15 05:59:33 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Jacob Burnim and George Necula and Koushik Sen",
title = "Specifying and checking semantic atomicity for
multithreaded programs",
journal = j-SIGPLAN,
volume = "47",
number = "4",
pages = "79--90",
month = apr,
year = "2012",
DOI = "https://doi.org/10.1145/2248487.1950377",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Thu Jun 7 08:15:03 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "In practice, it is quite difficult to write correct
multithreaded programs due to the potential for
unintended and nondeterministic interference between
parallel threads. A fundamental correctness property
for such programs is atomicity---a block of code in a
program is atomic if, for any parallel execution of the
program, there is an execution with the same overall
program behavior in which the block is executed
serially. We propose semantic atomicity, a
generalization of atomicity with respect to a
programmer-defined notion of equivalent behavior. We
propose an assertion framework in which a programmer
can use bridge predicates to specify noninterference
properties at the level of abstraction of their
application. Further, we propose a novel algorithm for
systematically testing atomicity specifications on
parallel executions with a bounded number of
interruptions---i.e. atomic blocks whose execution is
interleaved with that of other threads. We further
propose a set of sound heuristics and optional user
annotations that increase the efficiency of checking
atomicity specifications in the common case where the
specifications hold. We have implemented our assertion
framework for specifying and checking semantic
atomicity for parallel Java programs, and we have
written semantic atomicity specifications for a number
of benchmarks. We found that using bridge predicates
allowed us to specify the natural and intended atomic
behavior of a wider range of programs than did previous
approaches. Further, in checking our specifications, we
found several previously unknown bugs, including in the
widely-used java.util.concurrent library.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ASPLOS '12 conference proceedings.",
author = "{\"U}mit V. {\c{C}}ataly{\"u}rek and John Feo and
Assefaw H. Gebremedhin and Mahantesh Halappanavar and
Alex Pothen",
title = "Graph coloring algorithms for multi-core and massively
multithreaded architectures",
volume = "38",
number = "10--11",
pages = "576--594",
month = oct # "\slash " # nov,
year = "2012",
DOI = "https://doi.org/10.1016/j.parco.2012.07.001",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Thu Oct 25 09:00:31 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0167819112000592",
acknowledgement = ack-nhfb,
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
author = "Guancheng Chen and Per Stenstrom",
title = "Critical lock analysis: diagnosing critical section
bottlenecks in multithreaded applications",
crossref = "Hollingsworth:2012:SPI",
pages = "71:1--71:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://conferences.computer.org/sc/2012/papers/1000a099.pdf",
abstract = "Critical sections are well known potential performance
bottlenecks in multithreaded applications and
identifying the ones that inhibit scalability are
important for performance optimizations. While previous
approaches use idle time as a key measure, we show such
a measure is not reliable. The reason is that idleness
does not necessarily mean the critical section is on
the critical path. We introduce critical lock analysis,
a new method for diagnosing critical section
bottlenecks in multithreaded applications. Our method
firstly identifies the critical sections appearing on
the critical path, and then quantifies the impact of
such critical sections on the overall performance by
using quantitative performance metrics. Case studies
show that our method can successfully identify critical
sections that are most beneficial for improving overall
performance as well as quantify their performance
impact on the critical path, which results in a more
reliable establishment of the inherent critical section
bottlenecks than previous approaches.",
acknowledgement = ack-nhfb,
articleno = "71",
author = "Chih-Yuan Chen and Jhong-Yi Ciou and Rong-Guey Chang",
title = "Multi-level simultaneous multithreading scheduling to
reduce the temperature of register files",
journal = j-CCPE,
volume = "24",
number = "12",
pages = "1296--1316",
day = "25",
month = aug,
year = "2012",
DOI = "https://doi.org/10.1002/cpe.1831",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Mon Nov 5 07:44:51 MST 2012",
bibsource = "http://www.interscience.wiley.com/jpages/1532-0626;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Practice and Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "22 Sep 2011",
author = "Austin T. Clements and M. Frans Kaashoek and Nickolai
title = "Scalable address spaces using {RCU} balanced trees",
journal = j-COMP-ARCH-NEWS,
volume = "40",
number = "1",
pages = "199--210",
month = mar,
year = "2012",
DOI = "https://doi.org/10.1145/2189750.2150998",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri Jun 1 17:06:46 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "ASPLOS '12 conference proceedings.",
abstract = "Software developers commonly exploit multicore
processors by building multithreaded software in which
all threads of an application share a single address
space. This shared address space has a cost: kernel
virtual memory operations such as handling soft page
faults, growing the address space, mapping files, etc.
can limit the scalability of these applications. In
widely-used operating systems, all of these operations
are synchronized by a single per-process lock. This
paper contributes a new design for increasing the
concurrency of kernel operations on a shared address
space by exploiting read-copy-update (RCU) so that soft
page faults can both run in parallel with operations
that mutate the same address space and avoid contending
with other page faults on shared cache lines. To enable
such parallelism, this paper also introduces an
RCU-based binary balanced tree for storing memory
mappings. An experimental evaluation using three
multithreaded applications shows performance
improvements on 80 cores ranging from 1.7x to 3.4x for
an implementation of this design in the Linux 2.6.37
kernel. The RCU-based binary tree enables soft page
faults to run at a constant cost with an increasing
number of cores, suggesting that the design will scale
well beyond 80 cores.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Joseph Devietti and Benjamin P. Wood and Karin Strauss
and Luis Ceze and Dan Grossman and Shaz Qadeer",
title = "{RADISH}: always-on sound and complete
{{\underline{Ra}ce \underline{D}etection \underline{i}n
\underline{S}oftware and \underline{H}ardware}}",
journal = j-COMP-ARCH-NEWS,
volume = "40",
number = "3",
pages = "201--212",
month = jun,
year = "2012",
DOI = "https://doi.org/10.1145/2366231.2337182",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Thu Sep 6 10:21:07 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "ISCA '12 conference proceedings.",
abstract = "Data-race freedom is a valuable safety property for
multithreaded programs that helps with catching bugs,
simplifying memory consistency model semantics, and
verifying and enforcing both atomicity and determinism.
Unfortunately, existing software-only dynamic race
detectors are precise but slow; proposals with hardware
support offer higher performance but are imprecise.
Both precision and performance are necessary to achieve
the many advantages always-on dynamic race detection
could provide. To resolve this trade-off, we propose
Radish, a hybrid hardware-software dynamic race
detector that is always-on and fully precise. In
Radish, hardware caches a principled subset of the
metadata necessary for race detection; this subset
allows the vast majority of race checks to occur
completely in hardware. A flexible software layer
handles persistence of race detection metadata on cache
evictions and occasional queries to this expanded set
of metadata. We show that Radish is correct by proving
equivalence to a conventional happens-before race
detector. Our design has modest hardware complexity:
caches are completely unmodified and we piggy-back on
existing coherence messages but do not otherwise modify
the protocol. Furthermore, Radish can leverage
type-safe languages to reduce overheads substantially.
Our evaluation of a simulated 8-core Radish processor
using PARSEC benchmarks shows runtime overheads from
negligible to 2x, outperforming the leading
software-only race detector by 2x-37x.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Joseph Devietti and Jacob Nelson and Tom Bergan and
Luis Ceze and Dan Grossman",
title = "{RCDC}: a relaxed consistency deterministic computer",
journal = j-SIGPLAN,
volume = "47",
number = "4",
pages = "67--78",
month = apr,
year = "2012",
DOI = "https://doi.org/10.1145/2248487.1950376",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Thu Jun 7 08:15:03 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Providing deterministic execution significantly
simplifies the debugging, testing, replication, and
deployment of multithreaded programs. Recent work has
developed deterministic multiprocessor architectures as
well as compiler and runtime systems that enforce
determinism in current hardware. Such work has
incidentally imposed strong memory-ordering properties.
Historically, memory ordering has been relaxed in favor
of higher performance in shared memory multiprocessors
and, interestingly, determinism exacerbates the cost of
strong memory ordering. Consequently, we argue that
relaxed memory ordering is vital to achieving faster
deterministic execution. This paper introduces RCDC, a
deterministic multiprocessor architecture that takes
advantage of relaxed memory orderings to provide
high-performance deterministic execution with low
hardware complexity. RCDC has two key innovations: a
hybrid HW/SW approach to enforcing determinism; and a
new deterministic execution strategy that leverages
data-race-free-based memory models (e.g., the models
for Java and C++) to improve performance and
scalability without sacrificing determinism, even in
the presence of races. In our hybrid HW/SW approach,
the only hardware mechanisms required are
software-controlled store buffering and support for
precise instruction counting; we do not require
speculation. A runtime system uses these mechanisms to
enforce determinism for arbitrary programs. We evaluate
RCDC using PARSEC benchmarks and show that relaxing
memory ordering leads to performance and scalability
close to nondeterministic execution without requiring
any form of speculation. We also compare our new
execution strategy to one based on TSO
(total-store-ordering) and show that some applications
benefit significantly from the extra relaxation. We
also evaluate a software-only implementation of our new
deterministic execution strategy.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ASPLOS '12 conference proceedings.",
author = "Wei Ding and Yuanrui Zhang and Mahmut Kandemir and
Seung Woo Son",
title = "Compiler-directed file layout optimization for
hierarchical storage systems",
crossref = "Hollingsworth:2012:SPI",
pages = "41:1--41:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a030.pdf",
abstract = "File layout of array data is a critical factor that
effects the behavior of storage caches, and has so far
taken not much attention in the context of hierarchical
storage systems. The main contribution of this paper is
a compiler-driven file layout optimization scheme for
hierarchical storage caches. This approach, fully
automated within an optimizing compiler, analyzes a
multi-threaded application code and determines a file
layout for each disk-resident array referenced by the
code, such that the performance of the target storage
cache hierarchy is maximized. We tested our approach
using 16 I/O intensive application programs and
compared its performance against two previously
proposed approaches under different cache space
management schemes. Our experimental results show that
the proposed approach improves the execution time of
these parallel applications by 23.7\% on average.",
acknowledgement = ack-nhfb,
articleno = "41",
author = "Julian Dolby and Christian Hammer and Daniel Marino
and Frank Tip and Mandana Vaziri and Jan Vitek",
title = "A data-centric approach to synchronization",
journal = j-TOPLAS,
volume = "34",
number = "1",
pages = "4:1--4:48",
month = apr,
year = "2012",
DOI = "https://doi.org/10.1145/2160910.2160913",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Mon Apr 30 17:20:50 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Concurrency-related errors, such as data races, are
frustratingly difficult to track down and eliminate in
large object-oriented programs. Traditional approaches
to preventing data races rely on protecting instruction
sequences with synchronization operations. Such
control-centric approaches are inherently brittle, as
the burden is on the programmer to ensure that all
concurrently accessed memory locations are consistently
protected. Data-centric synchronization is an
alternative approach that offloads some of the work on
the language implementation. Data-centric
synchronization groups fields of objects into atomic
sets to indicate that these fields must always be
updated atomically. Each atomic set has associated
units of work, that is, code fragments that preserve
the consistency of that atomic set. Synchronization
operations are added automatically by the compiler. We
present an extension to the Java programming language
that integrates annotations for data-centric
concurrency control. The resulting language, called AJ,
relies on a type system that enables separate
compilation and supports atomic sets that span multiple
objects and that also supports full encapsulation for
more efficient code generation. We evaluate our
proposal by refactoring classes from standard
libraries, as well as a number of multithreaded
benchmarks, to use atomic sets. Our results suggest
that data-centric synchronization is easy to use and
enjoys low annotation overhead, while successfully
preventing data races. Moreover, experiments on the
SPECjbb benchmark suggest that acceptable performance
can be achieved with a modest amount of tuning.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Programming Languages and
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Hadi Esmaeilzadeh and Ting Cao and Yang Xi and Stephen
M. Blackburn and Kathryn S. McKinley",
title = "Looking back on the language and hardware revolutions:
measured power, performance, and scaling",
journal = j-SIGPLAN,
volume = "47",
number = "4",
pages = "319--332",
month = apr,
year = "2012",
DOI = "https://doi.org/10.1145/2248487.1950402",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Thu Jun 7 08:15:03 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper reports and analyzes measured chip power
and performance on five process technology generations
executing 61 diverse benchmarks with a rigorous
methodology. We measure representative Intel IA32
processors with technologies ranging from 130nm to 32nm
while they execute sequential and parallel benchmarks
written in native and managed languages. During this
period, hardware and software changed substantially:
(1) hardware vendors delivered chip multiprocessors
instead of uniprocessors, and independently (2)
software developers increasingly chose managed
languages instead of native languages. This
quantitative data reveals the extent of some known and
previously unobserved hardware and software trends. Two
themes emerge. (I) Workload: The power, performance,
and energy trends of native workloads do not
approximate managed workloads. For example, (a) the
SPEC CPU2006 native benchmarks on the i7 (45) and i5
(32) draw significantly less power than managed or
scalable native benchmarks; and (b) managed runtimes
exploit parallelism even when running single-threaded
applications. The results recommend architects always
include native and managed workloads when designing and
evaluating energy efficient hardware. (II)
Architecture: Clock scaling, microarchitecture,
simultaneous multithreading, and chip multiprocessors
each elicit a huge variety of power, performance, and
energy responses. This variety and the difficulty of
obtaining power measurements recommends exposing
on-chip power meters and when possible structure
specific power meters for cores, caches, and other
structures. Just as hardware event counters provide a
quantitative grounding for performance innovations,
power meters are necessary for optimizing energy.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ASPLOS '12 conference proceedings.",
author = "Stijn Eyerman and Lieven Eeckhout",
title = "Probabilistic modeling for job symbiosis scheduling on
{SMT} processors",
journal = j-TACO,
volume = "9",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2207222.2207223",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Jun 13 17:20:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Symbiotic job scheduling improves simultaneous
multithreading (SMT) processor performance by
coscheduling jobs that have ``compatible'' demands on
the processor's shared resources. Existing approaches
however require a sampling phase, evaluate a limited
number of possible coschedules, use heuristics to gauge
symbiosis, are rigid in their optimization target, and
do not preserve system-level priorities/shares. This
article proposes probabilistic job symbiosis modeling,
which predicts whether jobs will create positive or
negative symbiosis when coscheduled without requiring
the coschedule to be evaluated. The model, which uses
per-thread cycle stacks computed through a previously
proposed cycle accounting architecture, is simple
enough to be used in system software. Probabilistic job
symbiosis modeling provides six key innovations over
prior work in symbiotic job scheduling: (i) it does not
require a sampling phase, (ii) it readjusts the job
coschedule continuously, (iii) it evaluates a large
number of possible coschedules at very low overhead,
(iv) it is not driven by heuristics, (v) it can
optimize a performance target of interest (e.g., system
throughput or job turnaround time), and (vi) it
preserves system-level priorities/shares. These
innovations make symbiotic job scheduling both
practical and effective. Our experimental evaluation,
which assumes a realistic scenario in which jobs come
and go, reports an average 16\% (and up to 35\%)
reduction in job turnaround time compared to the
previously proposed SOS (sample, optimize, symbios)
approach for a two-thread SMT processor, and an average
19\% (and up to 45\%) reduction in job turnaround time
for a four-thread SMT processor.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Azadeh Farzan and Zachary Kincaid",
title = "Verification of parameterized concurrent programs by
modular reasoning about data and control",
journal = j-SIGPLAN,
volume = "47",
number = "1",
pages = "297--308",
month = jan,
year = "2012",
DOI = "https://doi.org/10.1145/2103621.2103693",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Thu Mar 15 18:16:55 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In this paper, we consider the problem of verifying
thread-state properties of multithreaded programs in
which the number of active threads cannot be statically
bounded. Our approach is based on decomposing the task
into two modules, where one reasons about data and the
other reasons about control. The data module computes
thread-state invariants (e.g., linear constraints over
global variables and local variables of one thread)
using the thread interference information computed by
the control module. The control module computes a
representation of thread interference, as an
incrementally constructed data flow graph, using the
data invariants provided by the data module. These
invariants are used to rule out patterns of thread
interference that can not occur in a real program
execution. The two modules are incorporated into a
feedback loop, so that the abstractions of data and
interference are iteratively coarsened as the algorithm
progresses (that is, they become weaker) until a fixed
point is reached. Our approach is sound and
terminating, and applicable to programs with infinite
state (e.g., unbounded integers) and unboundedly many
threads. The verification method presented in this
paper has been implemented into a tool, called Duet. We
demonstrate the effectiveness of our technique by
verifying properties of a selection of Linux device
drivers using Duet, and also compare Duet with previous
work on verification of parameterized Boolean program
using the Boolean abstractions of these drivers.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "POPL '12 conference proceedings.",
author = "Adam Foltzer and Abhishek Kulkarni and Rebecca Swords
and Sajith Sasidharan and Eric Jiang and Ryan Newton",
title = "A meta-scheduler for the par-monad: composable
scheduling for the heterogeneous cloud",
journal = j-SIGPLAN,
volume = "47",
number = "9",
pages = "235--246",
month = sep,
year = "2012",
DOI = "https://doi.org/10.1145/2398856.2364562",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Thu Nov 15 16:40:19 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Modern parallel computing hardware demands
increasingly specialized attention to the details of
scheduling and load balancing across heterogeneous
execution resources that may include GPU and cloud
environments, in addition to traditional CPUs. Many
existing solutions address the challenges of particular
resources, but do so in isolation, and in general do
not compose within larger systems. We propose a
general, composable abstraction for execution
resources, along with a continuation-based
meta-scheduler that harnesses those resources in the
context of a deterministic parallel programming library
for Haskell. We demonstrate performance benefits of
combined CPU/GPU scheduling over either alone, and of
combined multithreaded/distributed scheduling over
existing distributed programming approaches for
Haskell.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ICFP '12 conference proceedings.",
author = "Michael Garland and Manjunath Kudlur and Yili Zheng",
title = "Designing a unified programming model for
heterogeneous machines",
crossref = "Hollingsworth:2012:SPI",
pages = "67:1--67:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a064.pdf",
abstract = "While high-efficiency machines are increasingly
embracing heterogeneous architectures and massive
multithreading, contemporary mainstream programming
languages reflect a mental model in which processing
elements are homogeneous, concurrency is limited, and
memory is a flat undifferentiated pool of storage.
Moreover, the current state of the art in programming
heterogeneous machines tends towards using separate
programming models, such as OpenMP and CUDA, for
different portions of the machine. Both of these
factors make programming emerging heterogeneous
machines unnecessarily difficult. We describe the
design of the Phalanx programming model, which seeks to
provide a unified programming model for heterogeneous
machines. It provides constructs for bulk parallelism,
synchronization, and data placement which operate
across the entire machine. Our prototype implementation
is able to launch and coordinate work on both CPU and
GPU processors within a single node, and by leveraging
the GASNet runtime, is able to run across all the nodes
of a distributed-memory machine.",
acknowledgement = ack-nhfb,
articleno = "67",
author = "Mark Gebhart and Daniel R. Johnson and David Tarjan
and Stephen W. Keckler and William J. Dally and Erik
Lindholm and Kevin Skadron",
title = "A Hierarchical Thread Scheduler and Register File for
Energy-Efficient Throughput Processors",
journal = j-TOCS,
volume = "30",
number = "2",
pages = "8:1--8:??",
month = apr,
year = "2012",
DOI = "https://doi.org/10.1145/2166879.2166882",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Fri Apr 27 12:10:22 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Modern graphics processing units (GPUs) employ a large
number of hardware threads to hide both function unit
and memory access latency. Extreme multithreading
requires a complex thread scheduler as well as a large
register file, which is expensive to access both in
terms of energy and latency. We present two
complementary techniques for reducing energy on
massively-threaded processors such as GPUs. First, we
investigate a two-level thread scheduler that maintains
a small set of active threads to hide ALU and local
memory access latency and a larger set of pending
threads to hide main memory latency. Reducing the
number of threads that the scheduler must consider each
cycle improves the scheduler's energy efficiency.
Second, we propose replacing the monolithic register
file found on modern designs with a hierarchical
register file. We explore various trade-offs for the
hierarchy including the number of levels in the
hierarchy and the number of entries at each level. We
consider both a hardware-managed caching scheme and a
software-managed scheme, where the compiler is
responsible for orchestrating all data movement within
the register file hierarchy. Combined with a
hierarchical register file, our two-level thread
scheduler provides a further reduction in energy by
only allocating entries in the upper levels of the
register file hierarchy for active threads. Averaging
across a variety of real world graphics and compute
workloads, the active thread count can be reduced by a
factor of 4 with minimal impact on performance and our
most efficient three-level software-managed register
file hierarchy reduces register file energy by 54\%.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
author = "Sergey Grebenshchikov and Nuno P. Lopes and Corneliu
Popeea and Andrey Rybalchenko",
title = "Synthesizing software verifiers from proof rules",
journal = j-SIGPLAN,
volume = "47",
number = "6",
pages = "405--416",
month = jun,
year = "2012",
DOI = "https://doi.org/10.1145/2345156.2254112",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Aug 6 16:31:49 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "PLDI '12 proceedings.",
abstract = "Automatically generated tools can significantly
improve programmer productivity. For example, parsers
and dataflow analyzers can be automatically generated
from declarative specifications in the form of
grammars, which tremendously simplifies the task of
implementing a compiler. In this paper, we present a
method for the automatic synthesis of software
verification tools. Our synthesis procedure takes as
input a description of the employed proof rule, e.g.,
program safety checking via inductive invariants, and
produces a tool that automatically discovers the
auxiliary assertions required by the proof rule, e.g.,
inductive loop invariants and procedure summaries. We
rely on a (standard) representation of proof rules
using recursive equations over the auxiliary
assertions. The discovery of auxiliary assertions,
i.e., solving the equations, is based on an iterative
process that extrapolates solutions obtained for
finitary unrollings of equations. We show how our
method synthesizes automatic safety and liveness
verifiers for programs with procedures, multi-threaded
programs, and functional programs. Our experimental
comparison of the resulting verifiers with existing
state-of-the-art verification tools confirms the
practicality of the approach.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Mahantesh Halappanavar and John Feo and Oreste Villa
and Antonino Tumeo and Alex Pothen",
title = "Approximate weighted matching on emerging manycore and
multithreaded architectures",
journal = j-IJHPCA,
volume = "26",
number = "4",
pages = "413--430",
month = nov,
year = "2012",
DOI = "https://doi.org/10.1177/1094342012452893",
ISSN = "1094-3420 (print), 1741-2846 (electronic)",
ISSN-L = "1094-3420",
bibdate = "Thu Nov 8 11:31:16 MST 2012",
bibsource = "http://hpc.sagepub.com/content/26/4.toc;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://hpc.sagepub.com/content/26/4/413.full.pdf+html",
acknowledgement = ack-nhfb,
fjournal = "International Journal of High Performance Computing
Applications",
journal-URL = "http://hpc.sagepub.com/content/by/year",
onlinedate = "August 9, 2012",
author = "Christopher M. Hayden and Edward K. Smith and Michail
Denchev and Michael Hicks and Jeffrey S. Foster",
title = "{Kitsune}: efficient, general-purpose dynamic software
updating for {C}",
journal = j-SIGPLAN,
volume = "47",
number = "10",
pages = "249--264",
month = oct,
year = "2012",
DOI = "https://doi.org/10.1145/2398857.2384635",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Thu Nov 15 16:40:23 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Dynamic software updating (DSU) systems allow programs
to be updated while running, thereby permitting
developers to add features and fix bugs without
downtime. This paper introduces Kitsune, a new DSU
system for C whose design has three notable features.
First, Kitsune's updating mechanism updates the whole
program, not individual functions. This mechanism is
more flexible than most prior approaches and places no
restrictions on data representations or allowed
compiler optimizations. Second, Kitsune makes the
important aspects of updating explicit in the program
text, making the program's semantics easy to understand
while minimizing programmer effort. Finally, the
programmer can write simple specifications to direct
Kitsune to generate code that traverses and transforms
old-version state for use by new code; such state
transformation is often necessary, and is significantly
more difficult in prior DSU systems. We have used
Kitsune to update five popular, open-source, single-
and multi-threaded programs, and find that few program
changes are required to use Kitsune, and that it incurs
essentially no performance overhead.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "OOPSLA '12 conference proceedings.",
author = "Jeff Huang and Charles Zhang",
title = "Execution privatization for scheduler-oblivious
concurrent programs",
journal = j-SIGPLAN,
volume = "47",
number = "10",
pages = "737--752",
month = oct,
year = "2012",
DOI = "https://doi.org/10.1145/2398857.2384670",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Thu Nov 15 16:40:23 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Making multithreaded execution less non-deterministic
is a promising solution to address the difficulty of
concurrent programming plagued by the non-deterministic
thread scheduling. In fact, a vast category of
concurrent programs are scheduler-oblivious: their
execution is deterministic, regardless of the
scheduling behavior. We present and formally prove a
fundamental observation of the privatizability property
for scheduler-oblivious programs, that paves the
theoretical foundation for privatizing shared data
accesses on a path segment. With privatization, the
non-deterministic thread interleavings on the
privatized accesses are isolated and as the consequence
many concurrency problems are alleviated. We further
present a path and context sensitive privatization
algorithm that safely privatizes the program without
introducing any additional program behavior. Our
evaluation results show that the privatization
opportunity pervasively exists in real world large
complex concurrent systems. Through privatization,
several real concurrency bugs are fixed and notable
performance improvements are also achieved on
benchmarks.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "OOPSLA '12 conference proceedings.",
author = "Jos{\'e} A. Joao and M. Aater Suleman and Onur Mutlu
and Yale N. Patt",
title = "Bottleneck identification and scheduling in
multithreaded applications",
journal = j-COMP-ARCH-NEWS,
volume = "40",
number = "1",
pages = "223--234",
month = mar,
year = "2012",
DOI = "https://doi.org/10.1145/2189750.2151001",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri Jun 1 17:06:46 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "ASPLOS '12 conference proceedings.",
abstract = "Performance of multithreaded applications is limited
by a variety of bottlenecks, e.g. critical sections,
barriers and slow pipeline stages. These bottlenecks
serialize execution, waste valuable execution cycles,
and limit scalability of applications. This paper
proposes Bottleneck Identification and Scheduling in
Multithreaded Applications (BIS), a cooperative
software-hardware mechanism to identify and accelerate
the most critical bottlenecks. BIS identifies which
bottlenecks are likely to reduce performance by
measuring the number of cycles threads have to wait for
each bottleneck, and accelerates those bottlenecks
using one or more fast cores on an Asymmetric Chip
Multi-Processor (ACMP). Unlike previous work that
targets specific bottlenecks, BIS can identify and
accelerate bottlenecks regardless of their type. We
compare BIS to four previous approaches and show that
it outperforms the best of them by 15\% on average.
BIS' performance improvement increases as the number of
cores and the number of fast cores in the system
increase.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Pramod G. Joisha and Robert S. Schreiber and
Prithviraj Banerjee and Hans-J. Boehm and Dhruva R.
Chakrabarti",
title = "On a Technique for Transparently Empowering Classical
Compiler Optimizations on Multithreaded Code",
journal = j-TOPLAS,
volume = "34",
number = "2",
pages = "9:1--9:??",
month = jun,
year = "2012",
DOI = "https://doi.org/10.1145/2220365.2220368",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Fri Jun 29 17:33:40 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "A large body of data-flow analyses exists for
analyzing and optimizing sequential code.
Unfortunately, much of it cannot be directly applied on
parallel code, for reasons of correctness. This article
presents a technique to automatically, aggressively,
yet safely apply sequentially-sound data-flow
transformations, without change, on shared-memory
programs. The technique is founded on the notion of
program references being ``siloed'' on certain
control-flow paths. Intuitively, siloed references are
free of interference from other threads within the
confines of such paths. Data-flow transformations can,
in general, be unblocked on siloed references. The
solution has been implemented in a widely used
compiler. Results on benchmarks from SPLASH-2 show that
performance improvements of up to 41\% are possible,
with an average improvement of 6\% across all the
tested programs over all thread counts.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Programming Languages and
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Melanie Kambadur and Kui Tang and Martha A. Kim",
title = "{Harmony}: collection and analysis of parallel block
vectors",
journal = j-COMP-ARCH-NEWS,
volume = "40",
number = "3",
pages = "452--463",
month = jun,
year = "2012",
DOI = "https://doi.org/10.1145/2366231.2337211",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Thu Sep 6 10:21:07 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "ISCA '12 conference proceedings.",
abstract = "Efficient execution of well-parallelized applications
is central to performance in the multicore era. Program
analysis tools support the hardware and software sides
of this effort by exposing relevant features of
multithreaded applications. This paper describes
parallel block vectors, which uncover previously unseen
characteristics of parallel programs. Parallel block
vectors provide block execution profiles per
concurrency phase (e.g., the block execution profile of
all serial regions of a program). This information
provides a direct and fine-grained mapping between an
application's runtime parallel phases and the static
code that makes up those phases. This paper also
demonstrates how to collect parallel block vectors with
minimal application perturbation using Harmony. Harmony
is an instrumentation pass for the LLVM compiler that
introduces just 16-21\% overhead on average across
eight Parsec benchmarks. We apply parallel block
vectors to uncover several novel insights about
parallel applications with direct consequences for
architectural design. First, that the serial and
parallel phases of execution used in Amdahl's Law are
often composed of many of the same basic blocks.
Second, that program features, such as instruction mix,
vary based on the degree of parallelism, with serial
phases in particular displaying different instruction
mixes from the program as a whole. Third, that dynamic
execution frequencies do not necessarily correlate with
a block's parallelism.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Ming Kawaguchi and Patrick Rondon and Alexander Bakst
and Ranjit Jhala",
title = "Deterministic parallelism via liquid effects",
journal = j-SIGPLAN,
volume = "47",
number = "6",
pages = "45--54",
month = jun,
year = "2012",
DOI = "https://doi.org/10.1145/2345156.2254071",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Aug 6 16:31:49 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "PLDI '12 proceedings.",
abstract = "Shared memory multithreading is a popular approach to
parallel programming, but also fiendishly hard to get
right. We present Liquid Effects, a type-and-effect
system based on refinement types which allows for
fine-grained, low-level, shared memory multi-threading
while statically guaranteeing that a program is
deterministic. Liquid Effects records the effect of an
expression as a formula in first-order logic, making
our type-and-effect system highly expressive. Further,
effects like Read and Write are recorded in Liquid
Effects as ordinary uninterpreted predicates, leaving
the effect system open to extension by the user. By
building our system as an extension to an existing
dependent refinement type system, our system gains
precise value- and branch-sensitive reasoning about
effects. Finally, our system exploits the Liquid Types
refinement type inference technique to automatically
infer refinement types and effects. We have implemented
our type-and-effect checking techniques in CSOLVE, a
refinement type inference system for C programs. We
demonstrate how CSOLVE uses Liquid Effects to prove the
determinism of a variety of benchmarks.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Stephen W. Keckler and Steven K. Reinhardt",
title = "Massively Multithreaded Computing Systems",
journal = j-COMPUTER,
volume = "45",
number = "8",
pages = "24--25",
month = aug,
year = "2012",
DOI = "https://doi.org/10.1109/MC.2012.270",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Wed Aug 29 16:38:07 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/computer2010.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
fjournal = "Computer",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
author = "Arif M. Khan and David F. Gleich and Alex Pothen and
Mahantesh Halappanavar",
title = "A multithreaded algorithm for network alignment via
approximate matching",
crossref = "Hollingsworth:2012:SPI",
pages = "64:1--64:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://conferences.computer.org/sc/2012/papers/1000a054.pdf",
abstract = "Network alignment is an optimization problem to find
the best one-to-one map between the vertices of a pair
of graphs that overlaps as many edges as possible. It
is a relaxation of the graph isomorphism problem and is
closely related to the subgraph isomorphism problem.
The best current approaches are entirely heuristic and
iterative in nature. They generate real-valued
heuristic weights that must be rounded to find integer
solutions. This rounding requires solving a bipartite
maximum weight matching problem at each iteration in
order to avoid missing high quality solutions. We
investigate substituting a parallel, half-approximation
for maximum weight matching instead of an exact
computation. Our experiments show that the resulting
difference in solution quality is negligible. We
demonstrate almost a 20-fold speedup using 40 threads
on an 8 processor Intel Xeon E7-8870 system and now
solve real-world problems in 36 seconds instead of 10
minutes.",
acknowledgement = ack-nhfb,
articleno = "64",
author = "Artem Khyzha and Pavel Par{\'\i}zek and Corina S.
P{\u{a}}s{\u{a}}reanu",
title = "Abstract pathfinder",
journal = j-SIGSOFT,
volume = "37",
number = "6",
pages = "1--5",
month = nov,
year = "2012",
DOI = "https://doi.org/10.1145/2382756.2382794",
ISSN = "0163-5948 (print), 1943-5843 (electronic)",
ISSN-L = "0163-5948",
bibdate = "Wed Aug 1 17:16:18 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We present Abstract Pathfinder, an extension to the
Java Pathfinder (JPF) verification tool-set that
supports data abstraction to reduce the large data
domains of a Java program to small, finite abstract
domains, making the program more amenable to
verification. We use data abstraction to compute an
over-approximation of the original program in such a
way that if a (safety) property is true in the
abstracted program the property is also true in the
original program. Our approach enhances JPF with an
abstract interpreter and abstract state-matching
mechanisms, together with a library of abstractions
from which the user can pick which abstractions to use
for a particular application. We discuss the details of
our implementation together with some preliminary
experiments with analyzing multi-threaded Java
programs, where Abstract Pathfinder achieves
significant time and memory savings as compared with
plain JPF.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGSOFT Software Engineering Notes",
journal-URL = "https://dl.acm.org/citation.cfm?id=J728",
author = "Stephen Kyle and Igor B{\"o}hm and Bj{\"o}rn Franke
and Hugh Leather and Nigel Topham",
title = "Efficiently parallelizing instruction set simulation
of embedded multi-core processors using region-based
just-in-time dynamic binary translation",
journal = j-SIGPLAN,
volume = "47",
number = "5",
pages = "21--30",
month = may,
year = "2012",
DOI = "https://doi.org/10.1145/2345141.2248422",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Mon Aug 6 16:31:46 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "LCTES '12 proceedings.",
abstract = "Embedded systems, as typified by modern mobile phones,
are already seeing a drive toward using multi-core
processors. The number of cores will likely increase
rapidly in the future. Engineers and researchers need
to be able to simulate systems, as they are expected to
be in a few generations' time, running simulations of
many-core devices on today's multi-core machines. These
requirements place heavy demands on the scalability of
simulation engines, the fastest of which have typically
evolved from just-in-time (JIT) dynamic binary
translators (DBT). Existing work aimed at parallelizing
DBT simulators has focused exclusively on trace-based
DBT, wherein linear execution traces or perhaps trees
thereof are the units of translation. Region-based DBT
simulators have not received the same attention and
require different techniques than their trace-based
cousins. In this paper we develop an innovative
approach to scaling multi-core, embedded simulation
through region-based DBT. We initially modify the JIT
code generator of such a simulator to emit code that
does not depend on a particular thread with its
thread-specific context and is, therefore,
thread-agnostic. We then demonstrate that this
thread-agnostic code generation is comparable to
thread-specific code with respect to performance, but
also enables the sharing of JIT-compiled regions
between different threads. This sharing optimisation,
in turn, leads to significant performance improvements
for multi-threaded applications. In fact, our results
confirm that an average of 76\% of all JIT-compiled
regions can be shared between 128 threads in
representative, parallel workloads. We demonstrate that
this translates into an overall performance improvement
by 1.44x on average and up to 2.40x across 12
multi-threaded benchmarks taken from the Splash-2
benchmark suite, targeting our high-performance
multi-core DBT simulator for embedded ARC processors
running on a 4-core Intel host machine.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Nagesh B. Lakshminarayana and Jaekyu Lee and Hyesoon
Kim and Jinwoo Shin",
title = "{DRAM} Scheduling Policy for {GPGPU} Architectures
Based on a Potential Function",
volume = "11",
number = "2",
pages = "33--36",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.32",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "GPGPU architectures (applications) have several
different characteristics compared to traditional CPU
architectures (applications): highly multithreaded
architectures and SIMD-execution behavior are the two
important characteristics of GPGPU computing. In this
paper, we propose a potential function that models the
DRAM behavior in GPGPU architectures and a DRAM
scheduling policy, alpha-SJF policy to minimize the
potential function. The scheduling policy essentially
chooses between SJF and FR-FCFS at run-time based on
the number of requests from each thread and whether the
thread has a row buffer hit.",
acknowledgement = ack-nhfb,
affiliation = "Lakshminarayana, NB (Reprint Author), Georgia Inst
Technol, Sch Comp Sci, Atlanta, GA 30332 USA.
Lakshminarayana, Nagesh B.; Lee, Jaekyu; Kim, Hyesoon;
Shin, Jinwoo, Georgia Inst Technol, Sch Comp Sci,
Atlanta, GA 30332 USA.",
author-email = "nageshbl@cc.gatech.edu jaekyu.lee@cc.gatech.edu
hyesoon.kim@cc.gatech.edu jshin72@cc.gatech.edu",
da = "2019-06-20",
doc-delivery-number = "057JO",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "DRAM scheduling; GPGPU; Potential function",
number-of-cited-references = "5",
research-areas = "Computer Science",
researcherid-numbers = "Shin, Jinwoo/M-5389-2013",
times-cited = "7",
unique-id = "Lakshminarayana:2012:DSP",
web-of-science-categories = "Computer Science, Hardware \&
author = "Charles E. Leiserson and Tao B. Schardl and Jim
title = "Deterministic parallel random-number generation for
dynamic-multithreading platforms",
journal = j-SIGPLAN,
volume = "47",
number = "8",
pages = "193--204",
month = aug,
year = "2012",
DOI = "https://doi.org/10.1145/2370036.2145841",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Sep 12 12:11:57 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "PPOPP '12 conference proceedings.",
abstract = "Existing concurrency platforms for dynamic
multithreading do not provide repeatable parallel
random-number generators. This paper proposes that a
mechanism called pedigrees be built into the runtime
system to enable efficient deterministic parallel
random-number generation. Experiments with the
open-source MIT Cilk runtime system show that the
overhead for maintaining pedigrees is negligible.
Specifically, on a suite of 10 benchmarks, the relative
overhead of Cilk with pedigrees to the original Cilk
has a geometric mean of less than 1\%. We persuaded
Intel to modify its commercial C/C++ compiler, which
provides the Cilk Plus concurrency platform, to include
pedigrees, and we built a library implementation of a
deterministic parallel random-number generator called
DotMix that compresses the pedigree and then
``RC6-mixes'' the result. The statistical quality of
DotMix is comparable to that of the popular Mersenne
twister, but somewhat slower than a nondeterministic
parallel version of this efficient and high-quality
serial random-number generator. The cost of calling
DotMix depends on the ``spawn depth'' of the
invocation. For a naive Fibonacci calculation with n=40
that calls DotMix in every node of the computation,
this ``price of determinism'' is a factor of 2.65 in
running time, but for more realistic applications with
less intense use of random numbers --- such as a
maximal-independent-set algorithm, a practical
samplesort program, and a Monte Carlo discrete-hedging
application from QuantLib --- the observed ``price''
was less than 5\%. Moreover, even if overheads were
several times greater, applications using DotMix should
be amply fast for debugging purposes, which is a major
reason for desiring repeatability.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Xin Li and Reinhard von Hanxleden",
title = "Multithreaded Reactive Programming --- the {Kiel
Esterel} Processor",
journal = j-IEEE-TRANS-COMPUT,
volume = "61",
number = "3",
pages = "337--349",
month = mar,
year = "2012",
DOI = "https://doi.org/10.1109/TC.2010.246",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Fri Feb 3 07:35:03 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Cheng Ling and Khaled Benkrid and Tsuyoshi Hamada",
title = "High performance phylogenetic analysis on
{CUDA}-compatible {GPUs}",
journal = j-COMP-ARCH-NEWS,
volume = "40",
number = "5",
pages = "52--57",
month = dec,
year = "2012",
DOI = "https://doi.org/10.1145/2460216.2460226",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Sun May 5 09:49:56 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
note = "HEART '12 conference proceedings.",
abstract = "The operation of phylogenetic analysis aims to
investigate the evolution and relationships among
species. It is widely used in the fields of system
biology and comparative genomics. However, phylogenetic
analysis is also a computationally intensive operation
as the number of tree topologies grows in a factorial way
with the number of species involved. Therefore, due to
the large number of species in the real world, the
computational burden has largely thwarted phylogenetic
reconstruction. In this paper, we describe the detailed
GPU-based multi-threaded design and implementation of a
Markov Chain Monte Carlo (MCMC) maximum likelihood
algorithm for phylogenetic analysis on a set of aligned
nucleotide sequences. The implementation is based on
the framework of the most widely used phylogenetic
analysis tool, namely MrBayes. The proposed approach
resulted in 6x-8x speed-up on an NVidia Geforce 460 GTX
GPU compared to an optimized GPP-based software
implementation running on a desktop computer with a
single Intel Xeon 2.53 GHz CPU and 6.0 GB RAM.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Gu Liu and Hong An and Wenting Han and Xiaoqiang Li
and Tao Sun and Wei Zhou and Xuechao Wei and Xulong
title = "{FlexBFS}: a parallelism-aware implementation of
breadth-first search on {GPU}",
journal = j-SIGPLAN,
volume = "47",
number = "8",
pages = "279--280",
month = aug,
year = "2012",
DOI = "https://doi.org/10.1145/2370036.2145853",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Sep 12 12:11:57 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "PPOPP '12 conference proceedings.",
abstract = "In this paper, we present FlexBFS, a parallelism-aware
implementation for breadth-first search on GPU. Our
implementation can adjust the computation resources
according to the feedback of available parallelism
dynamically. We also optimized our program in three
ways: (1)~a simplified two-level queue management, (2)~a
combined kernel strategy and (3)~a high-degree vertices
specialization approach. Our experimental results show
that it can achieve 3--20 times speedup against the
fastest serial version, and can outperform the TBB
based multi-threading CPU version and the previous most
effective GPU version on all types of input graphs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Jason Mars and Naveen Kumar",
title = "{BlockChop}: dynamic squash elimination for hybrid
processor architecture",
journal = j-COMP-ARCH-NEWS,
volume = "40",
number = "3",
pages = "536--547",
month = jun,
year = "2012",
DOI = "https://doi.org/10.1145/2366231.2337221",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Thu Sep 6 10:21:07 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "ISCA '12 conference proceedings.",
abstract = "Hybrid processors are HW/SW co-designed processors
that leverage blocked-execution, the execution of
regions of instructions as atomic blocks, to facilitate
aggressive speculative optimization. As we move to a
multicore hybrid design, fine grained conflicts for
shared data can violate the atomicity requirement of
these blocks and lead to expensive squashes and
rollbacks. However, as these atomic regions differ from
those used in checkpointing and transactional memory
systems, the extent of this potentially prohibitive
problem remains unclear, and mechanisms to mitigate
these squashes dynamically may be critical to enable a
highly performant multicore hybrid design. In this
work, we investigate how multithreaded applications,
both benchmark and commercial workloads, are affected
by squashes, and present dynamic mechanisms for
mitigating these squashes in hybrid processors. While
the current wisdom is that there is not a significant
number of squashes for smaller atomic regions, we
observe this is not the case for many multithreaded
workloads. With region sizes of just 200--500
instructions, we observe a performance degradation
ranging from 10\% to more than 50\% for workloads with
a mixture of shared reads and writes. By harnessing the
unique flexibility provided by the software subsystem
of hybrid processor design, we present BlockChop, a
framework for dynamically mitigating squashes on
multicore hybrid processors. We present a range of
squash handling mechanisms leveraging retrials,
interpretation, and retranslation, and find that
BlockChop is quite effective. Over the current response
to exceptions and squashes in a hybrid design, we are
able to improve the performance of benchmark and
commercial workloads by 1.4x and 1.2x on average for
large and small region sizes respectively.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Timothy N. Miller and Renji Thomas and Xiang Pan and
Radu Teodorescu",
title = "{VRSync}: characterizing and eliminating
synchronization-induced voltage emergencies in
many-core processors",
journal = j-COMP-ARCH-NEWS,
volume = "40",
number = "3",
pages = "249--260",
month = jun,
year = "2012",
DOI = "https://doi.org/10.1145/2366231.2337188",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Thu Sep 6 10:21:07 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "ISCA '12 conference proceedings.",
abstract = "Power consumption is a primary concern for
microprocessor designers. Lowering the supply voltage
of processors is one of the most effective techniques
for improving their energy efficiency. Unfortunately,
low-voltage operation faces multiple challenges going
forward. One such challenge is increased sensitivity to
voltage fluctuations, which can trigger so-called
``voltage emergencies'' that can lead to errors. These
fluctuations are caused by abrupt changes in power
demand, triggered by processor activity variation as a
function of workload. This paper examines the effects
of voltage fluctuations on future many-core processors.
With the increase in the number of cores in a chip, the
effects of chip-wide activity fluctuation --- such as
that caused by global synchronization in multithreaded
applications --- overshadow the effects of core-level
workload variability. Starting from this observation,
we developed VRSync, a novel synchronization
methodology that uses emergency-aware scheduling
policies that reduce the slope of load fluctuations,
eliminating emergencies. We show that VRSync is very
effective at eliminating emergencies, allowing voltage
guardbands to be significantly lowered, which reduces
energy consumption by an average of 33\%.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Santosh Nagarakatte and Sebastian Burckhardt and Milo
M. K. Martin and Madanlal Musuvathi",
title = "Multicore acceleration of priority-based schedulers
for concurrency bug detection",
journal = j-SIGPLAN,
volume = "47",
number = "6",
pages = "543--554",
month = jun,
year = "2012",
DOI = "https://doi.org/10.1145/2345156.2254128",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Mon Aug 6 16:31:49 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "PLDI '12 proceedings.",
abstract = "Testing multithreaded programs is difficult as threads
can interleave in a nondeterministic fashion. Untested
interleavings can cause failures, but testing all
interleavings is infeasible. Many interleaving
exploration strategies for bug detection have been
proposed, but their relative effectiveness and
performance remains unclear as they often lack publicly
available implementations and have not been evaluated
using common benchmarks. We describe NeedlePoint, an
open-source framework that allows selection and
comparison of a wide range of interleaving exploration
policies for bug detection proposed by prior work. Our
experience with NeedlePoint indicates that
priority-based probabilistic concurrency testing (the
PCT algorithm) finds bugs quickly, but it runs only one
thread at a time, which destroys parallelism by
serializing executions. To address this problem we
propose a parallel version of the PCT algorithm~(PPCT).
We show that the new algorithm outperforms the original
by a factor of 5x when testing parallel programs on an
eight-core machine. We formally prove that parallel PCT
provides the same probabilistic coverage guarantees as
PCT. Moreover, PPCT is the first algorithm that runs
multiple threads while providing coverage guarantees.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Rahul Nagpal and Anasua Bhowmik",
title = "Criticality guided energy aware speculation for
speculative multithreaded processors",
volume = "38",
number = "6--7",
pages = "329--341",
month = jun # "\slash " # jul,
year = "2012",
DOI = "https://doi.org/10.1016/j.parco.2012.03.002",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Sun May 20 09:14:24 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0167819112000191",
acknowledgement = ack-nhfb,
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
author = "Doohwan Oh and Won W. Ro",
title = "Multi-Threading and Suffix Grouping on Massive
Multiple Pattern Matching Algorithm",
journal = j-COMP-J,
volume = "55",
number = "11",
pages = "1331--1346",
month = nov,
year = "2012",
DOI = "https://doi.org/10.1093/comjnl/bxs002",
ISSN = "0010-4620 (print), 1460-2067 (electronic)",
ISSN-L = "0010-4620",
bibdate = "Thu Nov 1 11:25:36 MDT 2012",
bibsource = "http://comjnl.oxfordjournals.org/content/55/11.toc;
URL = "http://comjnl.oxfordjournals.org/content/55/11/1331.full.pdf+html",
acknowledgement = ack-nhfb,
fjournal = "The Computer Journal",
journal-URL = "http://comjnl.oxfordjournals.org/",
onlinedate = "February 2, 2012",
author = "Stephen L. Olivier and Bronis R. de Supinski and
Martin Schulz and Jan F. Prins",
title = "Characterizing and mitigating work time inflation in
task parallel programs",
crossref = "Hollingsworth:2012:SPI",
pages = "65:1--65:12",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://conferences.computer.org/sc/2012/papers/1000a066.pdf",
abstract = "Task parallelism raises the level of abstraction in
shared memory parallel programming to simplify the
development of complex applications. However, task
parallel applications can exhibit poor performance due
to thread idleness, scheduling overheads, and work time
inflation --- additional time spent by threads in a
multithreaded computation beyond the time required to
perform the same work in a sequential computation. We
identify the contributions of each factor to lost
efficiency in various task parallel OpenMP applications
and diagnose the causes of work time inflation in those
applications. Increased data access latency can cause
significant work time inflation in NUMA systems. Our
locality framework for task parallel OpenMP programs
mitigates this cause of work time inflation. Our
extensions to the Qthreads library demonstrate that
locality-aware scheduling can improve performance up to
3X compared to the Intel OpenMP task scheduler.",
acknowledgement = ack-nhfb,
articleno = "65",
author = "Robert Preissl and Theodore M. Wong and Pallab Datta
and Myron Flickner and Raghavendra Singh and Steven K.
Esser and William P. Risk and Horst D. Simon and
Dharmendra S. Modha",
title = "{Compass}: a scalable simulator for an architecture
for cognitive computing",
crossref = "Hollingsworth:2012:SPI",
pages = "54:1--54:11",
year = "2012",
bibdate = "Thu Nov 15 07:38:35 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://conferences.computer.org/sc/2012/papers/1000a085.pdf",
abstract = "Inspired by the function, power, and volume of the
organic brain, we are developing TrueNorth, a novel
modular, non-von Neumann, ultra-low power, compact
architecture. TrueNorth consists of a scalable network
of neurosynaptic cores, with each core containing
neurons, dendrites, synapses, and axons. To set sail
for TrueNorth, we developed Compass, a multi-threaded,
massively parallel functional simulator and a parallel
compiler that maps a network of long-distance pathways
in the macaque monkey brain to TrueNorth. We
demonstrate near-perfect weak scaling on a 16 rack
IBM\reg{} Blue Gene\reg{}/Q (262144 CPUs, 256 TB
memory), achieving an unprecedented scale of 256
million neurosynaptic cores containing 65 billion
neurons and 16 trillion synapses running only 388X
slower than real time with an average spiking rate of
8.1 Hz. By using emerging PGAS communication
primitives, we also demonstrate 2X better real-time
performance over MPI primitives on a 4 rack Blue Gene/P
(16384 CPUs, 16 TB memory).",
acknowledgement = ack-nhfb,
articleno = "54",
author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N.
title = "Thread Tranquilizer: Dynamically reducing performance
journal = j-TACO,
volume = "8",
number = "4",
pages = "46:1--46:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086725",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "To realize the performance potential of multicore
systems, we must effectively manage the interactions
between memory reference behavior and the operating
system policies for thread scheduling and migration
decisions. We observe that these interactions lead to
significant variations in the performance of a given
application, from one execution to the next, even when
the program input remains unchanged and no other
applications are being run on the system. Our
experiments with multithreaded programs, including the
TATP database application, SPECjbb2005, and a subset of
PARSEC and SPEC OMP programs, on a 24-core Dell
PowerEdge R905 server running OpenSolaris confirm the
above observation.",
acknowledgement = ack-nhfb,
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Gregorio Quintana-Ort{\'\i} and Francisco D. Igual and
Mercedes Marqu{\'e}s and Enrique S. Quintana-Ort{\'\i}
and Robert A. van de Geijn",
title = "A Runtime System for Programming Out-of-Core Matrix
Algorithms-by-Tiles on Multithreaded Architectures",
journal = j-TOMS,
volume = "38",
number = "4",
pages = "25:1--25:25",
month = aug,
year = "2012",
DOI = "https://doi.org/10.1145/2331130.2331133",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Thu Aug 30 18:55:10 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Out-of-core implementations of algorithms for dense
matrix computations have traditionally focused on
optimal use of memory so as to minimize I/O, often
trading programmability for performance. In this
article we show how the current state of hardware and
software allows the programmability problem to be
addressed without sacrificing performance. This comes
from the realizations that memory is cheap and large,
making it less necessary to optimally orchestrate I/O,
and that new algorithms view matrices as collections of
submatrices and computation as operations with those
submatrices. This enables libraries to be coded at a
high level of abstraction, leaving the tasks of
scheduling the computations and data movement in the
hands of a runtime system. This is in sharp contrast to
more traditional approaches that leverage optimal use
of in-core memory and, at the expense of introducing
considerable programming complexity, explicit overlap
of I/O with computation. Performance is demonstrated
for this approach on multicore architectures as well as
platforms equipped with hardware accelerators.",
acknowledgement = ack-nhfb,
articleno = "25",
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
author = "Petar Radojkovi{\'c} and Sylvain Girbal and Arnaud
Grasset and Eduardo Qui{\~n}ones and Sami Yehia and
Francisco J. Cazorla",
title = "On the evaluation of the impact of shared resources in
multithreaded {COTS} processors in time-critical
journal = j-TACO,
volume = "8",
number = "4",
pages = "34:1--34:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086713",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Commercial Off-The-Shelf (COTS) processors are now
commonly used in real-time embedded systems. The
characteristics of these processors fulfill system
requirements in terms of time-to-market, low cost, and
high performance-per-watt ratio. However, multithreaded
(MT) processors are still not widely used in real-time
systems because the timing analysis is too complex. In
MT processors, simultaneously-running tasks share and
compete for processor resources, so the timing analysis
has to estimate the possible impact that the inter-task
interferences have on the execution time of the
applications. In this paper, we propose a method that
quantifies the slowdown that simultaneously-running
tasks may experience due to collision in shared
processor resources.",
acknowledgement = ack-nhfb,
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Petar Radojkovi{\'c} and Vladimir Cakarevi{\'c} and
Miquel Moret{\'o} and Javier Verd{\'u} and Alex Pajuelo
and Francisco J. Cazorla and Mario Nemirovsky and Mateo
title = "Optimal task assignment in multithreaded processors: a
statistical approach",
journal = j-COMP-ARCH-NEWS,
volume = "40",
number = "1",
pages = "235--248",
month = mar,
year = "2012",
DOI = "https://doi.org/10.1145/2189750.2151002",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri Jun 1 17:06:46 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "ASPLOS '12 conference proceedings.",
abstract = "The introduction of massively multithreaded (MMT)
processors, comprised of a large number of cores with
many shared resources, has made task scheduling, in
particular task to hardware thread assignment, one of
the most promising ways to improve system performance.
However, finding an optimal task assignment for a
workload running on MMT processors is an NP-complete
problem. Due to the fact that the performance of the
best possible task assignment is unknown, the room for
improvement of current task-assignment algorithms
cannot be determined. This is a major problem for the
industry because it could lead to: (1)~A waste of
resources if excessive effort is devoted to improving a
task assignment algorithm that already provides a
performance that is close to the optimal one, or
(2)~significant performance loss if insufficient effort
is devoted to improving poorly-performing task
assignment algorithms. In this paper, we present a
method based on Extreme Value Theory that allows the
prediction of the performance of the optimal task
assignment in MMT processors. We further show that
executing a sample of several hundred or several
thousand random task assignments is enough to obtain,
with very high confidence, an assignment with a
performance that is close to the optimal one. We
validate our method with an industrial case study for a
set of multithreaded network applications running on an
UltraSPARC~T2 processor.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Sherief Reda and Ryan Cochran and Ayse K. Coskun",
title = "Adaptive Power Capping for Servers with Multithreaded
journal = j-IEEE-MICRO,
volume = "32",
number = "5",
pages = "64--75",
month = sep # "\slash " # oct,
year = "2012",
DOI = "https://doi.org/10.1109/MM.2012.59",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Thu Nov 15 05:59:33 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Maria-Cecilia Rivara and Pedro Rodriguez and Rafael
Montenegro and Gaston Jorquera",
title = "Multithread parallelization of {Lepp}-bisection
journal = j-APPL-NUM-MATH,
volume = "62",
number = "4",
pages = "473--488",
month = apr,
year = "2012",
DOI = "https://doi.org/10.1016/j.apnum.2011.07.011",
ISSN = "0168-9274 (print), 1873-5460 (electronic)",
ISSN-L = "0168-9274",
bibdate = "Thu Mar 8 07:24:47 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/applnummath.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0168927411001292",
acknowledgement = ack-nhfb,
fjournal = "Applied Numerical Mathematics",
journal-URL = "http://www.sciencedirect.com/science/journal/01689274",
author = "Jennifer B. Sartor and Lieven Eeckhout",
title = "Exploring multi-threaded {Java} application
performance on multicore hardware",
journal = j-SIGPLAN,
volume = "47",
number = "10",
pages = "281--296",
month = oct,
year = "2012",
DOI = "https://doi.org/10.1145/2398857.2384638",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Thu Nov 15 16:40:23 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "While there have been many studies of how to schedule
applications to take advantage of increasing numbers of
cores in modern-day multicore processors, few have
focused on multi-threaded managed language applications
which are prevalent from the embedded to the server
domain. Managed languages complicate performance
studies because they have additional virtual machine
threads that collect garbage and dynamically compile,
closely interacting with application threads. Further
complexity is introduced as modern multicore machines
have multiple sockets and dynamic frequency scaling
options, broadening opportunities to reduce both power
and running time. In this paper, we explore the
performance of Java applications, studying how best to
map application and virtual machine (JVM) threads to a
multicore, multi-socket environment. We explore both
the cost of separating JVM threads from application
threads, and the opportunity to speed up or slow down
the clock frequency of isolated threads. We perform
experiments with the multi-threaded DaCapo benchmarks
and pseudojbb2005 running on the Jikes Research Virtual
Machine, on a dual-socket, 8-core Intel Nehalem machine
to reveal several novel, and sometimes
counter-intuitive, findings. We believe these insights
are a first but important step towards understanding
and optimizing managed language performance on modern
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "OOPSLA '12 conference proceedings.",
author = "Mageda Sharafeddine and Komal Jothi and Haitham
title = "Disjoint out-of-order execution processor",
journal = j-TACO,
volume = "9",
number = "3",
pages = "19:1--19:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355592",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "High-performance superscalar architectures used to
exploit instruction level parallelism in single-thread
applications have become too complex and power hungry
for the multicore processors era. We propose a new
architecture that uses multiple small latency-tolerant
out-of-order cores to improve single-thread
performance. Improving single-thread performance with
multiple small out-of-order cores allows designers to
place more of these cores on the same die.
Consequently, emerging highly parallel applications can
take full advantage of the multicore parallel hardware
without sacrificing performance of inherently serial
and hard to parallelize applications. Our architecture
combines speculative multithreading (SpMT) with
checkpoint recovery and continual flow pipeline
architectures. It splits single-thread program
execution into disjoint control and data threads that
execute concurrently on multiple cooperating small and
latency-tolerant out-of-order cores. Hence we call this
style of execution Disjoint Out-of-Order Execution
(DOE). DOE uses latency tolerance to overcome
performance issues of SpMT caused by interthread data
dependences. To evaluate this architecture, we have
developed a microarchitecture performance model of DOE
based on PTLSim, a simulation infrastructure of the x86
instruction set architecture. We evaluate the potential
performance of DOE processor architecture using a
simple heuristic to fork control independent threads in
hardware at the target addresses of future procedure
return instructions. Using applications from SpecInt
2000, we study DOE under ideal as well as realistic
architectural constraints. We discuss the performance
impact of key DOE architecture and application
variables such as number of cores, interthread data
dependences, intercore data communication delay,
buffers capacity, and branch mispredictions. Without
any DOE specific compiler optimizations, our results
show that DOE outperforms conventional SpMT
architectures by 15\%, on average. We also show that
DOE with four small cores can perform on average
equally well to a large superscalar core, consuming
about the same power. Most importantly, DOE improves
throughput performance by a significant amount over a
large superscalar core, up to 2.5 times, when running
multitasking applications.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Mahesh Shirole and Rajeev Kumar",
title = "Testing for concurrency in {UML} diagrams",
journal = j-SIGSOFT,
volume = "37",
number = "5",
pages = "1--8",
month = sep,
year = "2012",
DOI = "https://doi.org/10.1145/2347696.2347712",
ISSN = "0163-5948 (print), 1943-5843 (electronic)",
ISSN-L = "0163-5948",
bibdate = "Wed Aug 1 17:16:16 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Concurrent programming is increasingly being used in
many applications with the advent of multi-cores. The
necessary support for execution of multi-threading is
getting richer. Notwithstanding, a concurrent program
may behave nondeterministically, it may result in
different outputs with the same input in different
runs. The aim of this study is to generate test
sequences for concurrency from unified modelling
language (UML) behavioral models such as sequence and
activity diagrams. Generating exhaustive test cases for
all concurrent interleaving sequences is exponential in
size. Therefore, it is necessary to find adequate test
cases in presence of concurrency to uncover errors due
to, e.g., data race, synchronization and deadlocks. In
order to generate adequate test cases a novel search
algorithm, which we call concurrent queue search (CQS)
is proposed. The CQS handles random nature of
concurrent tasks. To generate test scenarios, a
sequence diagram is converted into an activity diagram.
An activity diagram encapsulates sequential,
conditional, iterative and concurrent flows of the
control. By the experimental results, it was observed
that test sequences generated by CQS algorithm are
superior as compared to DFS and BFS search
acknowledgement = ack-nhfb,
fjournal = "ACM SIGSOFT Software Engineering Notes",
journal-URL = "https://dl.acm.org/citation.cfm?id=J728",
author = "Abhayendra Singh and Satish Narayanasamy and Daniel
Marino and Todd Millstein and Madanlal Musuvathi",
title = "End-to-end sequential consistency",
journal = j-COMP-ARCH-NEWS,
volume = "40",
number = "3",
pages = "524--535",
month = jun,
year = "2012",
DOI = "https://doi.org/10.1145/2366231.2337220",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Thu Sep 6 10:21:07 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "ISCA '12 conference proceedings.",
abstract = "Sequential consistency (SC) is arguably the most
intuitive behavior for a shared-memory multithreaded
program. It is widely accepted that language-level SC
could significantly improve programmability of a
multiprocessor system. However, efficiently supporting
end-to-end SC remains a challenge as it requires that
both compiler and hardware optimizations preserve SC
semantics. While a recent study has shown that a
compiler can preserve SC semantics for a small
performance cost, an efficient and complexity-effective
SC hardware remains elusive. Past hardware solutions
relied on aggressive speculation techniques, which has
not yet been realized in a practical implementation.
This paper exploits the observation that hardware need
not enforce any memory model constraints on accesses to
thread-local and shared read-only locations. A
processor can easily determine a large fraction of
these safe accesses with assistance from static
compiler analysis and the hardware memory management
unit. We discuss a low-complexity hardware design that
exploits this information to reduce the overhead in
ensuring SC. Our design employs an additional unordered
store buffer for fast-tracking thread-local stores and
allowing later memory accesses to proceed without a
memory ordering related stall. Our experimental study
shows that the cost of guaranteeing end-to-end SC is
only 6.2\% on average when compared to a system with
TSO hardware executing a stock compiler's output.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Kazutoshi Suito and Rikuhei Ueda and Kei Fujii and
Takuma Kogo and Hiroki Matsutani and Nobuyuki
title = "The Dependable Responsive Multithreaded Processor for
Distributed Real-Time Systems",
journal = j-IEEE-MICRO,
volume = "32",
number = "6",
pages = "52--61",
month = nov # "\slash " # dec,
year = "2012",
DOI = "https://doi.org/10.1109/MM.2012.88",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Thu Dec 13 15:52:22 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Andrei Terechko and Jan Hoogerbrugge and Ghiath Alkadi
and Surendra Guntur and Anirban Lahiri and Marc
Duranton and Clemens W{\"u}st and Phillip Christie and
Axel Nackaerts and Aatish Kumar",
title = "Balancing Programmability and Silicon Efficiency of
Heterogeneous Multicore Architectures",
journal = j-TECS,
volume = "11S",
number = "1",
pages = "14:1--14:??",
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2180887.2180890",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Jun 7 16:18:52 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Multicore architectures provide scalable performance
with a lower hardware design effort than single core
processors. Our article presents a design methodology
and an embedded multicore architecture, focusing on
reducing the software design complexity and boosting
the performance density. First, we analyze
characteristics of the Task-Level Parallelism in modern
multimedia workloads. These characteristics are used to
formulate requirements for the programming model. Then
we translate the programming model requirements to an
architecture specification, including a novel
low-complexity implementation of cache coherence and a
hardware synchronization unit. Our evaluation
demonstrates that the novel coherence mechanism
substantially simplifies hardware design, while
reducing the performance by less than 18\% relative to
a complex snooping technique. Compared to a single
processor core, the multicores have already proven to
be more area- and energy-efficient. However, the
multicore architectures in embedded systems still
compete with highly efficient function-specific
hardware accelerators. In this article we identify five
architectural methods to boost performance density of
multicores; microarchitectural downscaling, asymmetric
multicore architectures, multithreading, generic
accelerators, and conjoining. Then, we present a novel
methodology to explore multicore design spaces,
including the architectural methods improving the
performance density. The methodology is based on a
complex formula computing performances of heterogeneous
multicore systems. Using this design space exploration
methodology for HD and QuadHD H.264 video decoding, we
estimate that the required areas of multicores in CMOS
45 nm are 2.5 mm$^2$ and 8.6 mm$^2$, respectively.
These results suggest that heterogeneous multicores are
cost-effective for embedded applications and can
provide a good programmability support.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840",
author = "Antonino Tumeo and Simone Secchi and Oreste Villa",
title = "Designing Next-Generation Massively Multithreaded
Architectures for Irregular Applications",
journal = j-COMPUTER,
volume = "45",
number = "8",
pages = "53--61",
month = aug,
year = "2012",
DOI = "https://doi.org/10.1109/MC.2012.193",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Wed Aug 29 16:38:07 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/computer2010.bib;
acknowledgement = ack-nhfb,
fjournal = "Computer",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2",
author = "Oreste Villa and Antonino Tumeo and Simone Secchi and
Joseph B. Manzano",
title = "Fast and Accurate Simulation of the {Cray XMT}
Multithreaded Supercomputer",
volume = "23",
number = "12",
pages = "2266--2279",
month = dec,
year = "2012",
DOI = "https://doi.org/10.1109/TPDS.2012.70",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu Nov 15 06:27:40 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
Systems",
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Roberto Vitali and Alessandro Pellegrini and Francesco
title = "Load sharing for optimistic parallel simulations on
multi core machines",
journal = j-SIGMETRICS,
volume = "40",
number = "3",
pages = "2--11",
month = dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2425248.2425250",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Sun May 5 09:58:20 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Parallel Discrete Event Simulation (PDES) is based on
the partitioning of the simulation model into distinct
Logical Processes (LPs), each one modeling a portion of
the entire system, which are allowed to execute
simulation events concurrently. This allows exploiting
parallel computing architectures to speedup model
execution, and to make very large models tractable. In
this article we cope with the optimistic approach to
PDES, where LPs are allowed to concurrently process
their events in a speculative fashion, and
rollback/recovery techniques are used to guarantee state
consistency in case of causality violations along the
speculative execution path. Particularly, we present an
innovative load sharing approach targeted at optimizing
resource usage for fruitful simulation work when
running an optimistic PDES environment on top of
multi-processor/multi-core machines. Beyond providing
the load sharing model, we also define a load sharing
oriented architectural scheme, based on a symmetric
multi-threaded organization of the simulation platform.
Finally, we present a real implementation of the load
sharing architecture within the open source ROme
OpTimistic Simulator (ROOT-Sim) package. Experimental
data for an assessment of both viability and
effectiveness of our proposal are presented as well.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
author = "Haris Volos and Andres Jaan Tack and Michael M. Swift
and Shan Lu",
title = "Applying transactional memory to concurrency bugs",
journal = j-COMP-ARCH-NEWS,
volume = "40",
number = "1",
pages = "211--222",
month = mar,
year = "2012",
DOI = "https://doi.org/10.1145/2189750.2150999",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri Jun 1 17:06:46 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "ASPLOS '12 conference proceedings.",
abstract = "Multithreaded programs often suffer from
synchronization bugs such as atomicity violations and
deadlocks. These bugs arise from complicated locking
strategies and ad hoc synchronization methods to avoid
the use of locks. A survey of the bug databases of
major open-source applications shows that concurrency
bugs often take multiple fix attempts, and that fixes
often introduce yet more concurrency bugs.
Transactional memory (TM) enables programmers to
declare regions of code atomic without specifying a
lock and has the potential to avoid these bugs. Where
most previous studies have focused on using TM to write
new programs from scratch, we consider its utility in
fixing existing programs with concurrency bugs. We
therefore investigate four methods of using TM on three
concurrent programs. Overall, we find that 29\% of the
bugs are not fixable by transactional memory, showing
that TM does not address many important types of
concurrency bugs. In particular, TM works poorly with
extremely long critical sections and with deadlocks
involving both condition variables and I/O. Conversely,
we find that for 56\% of the bugs, transactional memory
offers demonstrable value by simplifying the reasoning
behind a fix or the effort to implement a fix, and
using transactions in the first place would have
avoided 71\% of the bugs examined. We also find that ad
hoc synchronization put in place to avoid the overhead
of locking can be greatly simplified with TM, but
requires hardware support to perform well.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Zheng Wei and Joseph Jaja",
title = "Optimization of Linked List Prefix Computations on
Multithreaded {GPUs} Using {CUDA}",
volume = "22",
number = "4",
pages = "1250012",
month = dec,
year = "2012",
DOI = "https://doi.org/10.1142/S0129626412500120",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
ISSN-L = "0129-6264",
bibdate = "Sat Jun 22 15:54:17 MDT 2013",
bibsource = "http://ejournals.wspc.com.sg/ppl/;
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "Jingyue Wu and Yang Tang and Gang Hu and Heming Cui
and Junfeng Yang",
title = "Sound and precise analysis of parallel programs
through schedule specialization",
journal = j-SIGPLAN,
volume = "47",
number = "6",
pages = "205--216",
month = jun,
year = "2012",
DOI = "https://doi.org/10.1145/2345156.2254090",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Aug 6 16:31:49 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "PLDI '12 proceedings.",
abstract = "Parallel programs are known to be difficult to
analyze. A key reason is that they typically have an
enormous number of execution interleavings, or
schedules. Static analysis over all schedules requires
over-approximations, resulting in poor precision;
dynamic analysis rarely covers more than a tiny
fraction of all schedules. We propose an approach
called schedule specialization to analyze a parallel
program over only a small set of schedules for
precision, and then enforce these schedules at runtime
for soundness of the static analysis results. We build
a schedule specialization framework for C/C++
multithreaded programs that use Pthreads. Our framework
avoids the need to modify every analysis to be
schedule-aware by specializing a program into a simpler
program based on a schedule, so that the resultant
program can be analyzed with stock analyses for
improved precision. Moreover, our framework provides a
precise schedule-aware def-use analysis on memory
locations, enabling us to build three highly precise
analyses: an alias analyzer, a data-race detector, and
a path slicer. Evaluation on 17 programs, including 2
real-world programs and 15 popular benchmarks, shows
that analyses using our framework reduced may-aliases
by 61.9\%, false race reports by 69\%, and path slices
by 48.7\%; and detected 7 unknown bugs in well-checked
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Polychronis Xekalakis and Nikolas Ioannou and Marcelo
title = "Mixed speculative multithreaded execution models",
journal = j-TACO,
volume = "9",
number = "3",
pages = "18:1--18:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355591",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "The current trend toward multicore architectures has
placed great pressure on programmers and compilers to
generate thread-parallel programs. Improved execution
performance can no longer be obtained via traditional
single-thread instruction level parallelism (ILP), but,
instead, via multithreaded execution. One notable
technique that facilitates the extraction of parallel
threads from sequential applications is thread-level
speculation (TLS). This technique allows
programmers/compilers to generate threads without
checking for inter-thread data and control dependences,
which are then transparently enforced by the hardware.
Most prior work on TLS has concentrated on thread
selection and mechanisms to efficiently support the
main TLS operations, such as squashes, data versioning,
and commits. This article seeks to enhance TLS
functionality by combining it with other speculative
multithreaded execution models. The main idea is that
TLS already requires extensive hardware support, which
when slightly augmented can accommodate other
speculative multithreaded techniques. Recognizing that
for different applications, or even program phases, the
application bottlenecks may be different, it is
reasonable to assume that the more versatile a system
is, the more efficiently it will be able to execute the
given program. Toward this direction, we first show
that mixed execution models that combine TLS with
Helper Threads (HT), RunAhead execution (RA) and
MultiPath execution (MP) perform better than any of the
models alone. Based on a simple model that we propose,
we show that benefits come from being able to extract
additional ILP without harming the TLP extracted by
TLS. We then show that by combining all the execution
models in a unified one that combines all these
speculative multithreaded models, ILP can be further
enhanced with only minimal additional cost in
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Jingling Xue",
title = "Rethinking {Java} call stack design for tiny embedded
journal = j-SIGPLAN,
volume = "47",
number = "5",
pages = "1--10",
month = may,
year = "2012",
DOI = "https://doi.org/10.1145/2345141.2248420",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Aug 6 16:31:46 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "LCTES '12 proceedings.",
abstract = "The ability of tiny embedded devices to run large
feature-rich programs is typically constrained by the
amount of memory installed on such devices.
Furthermore, the useful operation of these devices in
wireless sensor applications is limited by their
battery life. This paper presents a call stack redesign
targeted at an efficient use of RAM storage and CPU
cycles by a Java program running on a wireless sensor
mote. Without compromising the application programs,
our call stack redesign saves 30\% of RAM, on average,
evaluated over a large number of benchmarks. On the
same set of benchmarks, our design also avoids
frequent RAM allocations and deallocations, resulting
in average 80\% fewer memory operations and 23\% faster
program execution. These may be critical improvements
for tiny embedded devices that are equipped with small
amount of RAM and limited battery life. However, our
call stack redesign is equally effective for any
complex multi-threaded object oriented program
developed for desktop computers. We describe the
redesign, measure its performance and report the
resulting savings in RAM and execution time for a wide
variety of programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Makoto Yamashita and Katsuki Fujisawa and Mituhiro
Fukuda and Kazuhide Nakata and Maho Nakata",
title = "{Algorithm 925}: Parallel Solver for Semidefinite
Programming Problem having Sparse {Schur} Complement
journal = j-TOMS,
volume = "39",
number = "1",
pages = "6:1--6:22",
month = nov,
year = "2012",
DOI = "https://doi.org/10.1145/2382585.2382591",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Thu Dec 6 07:36:30 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "A SemiDefinite Programming (SDP) problem is one of the
most central problems in mathematical optimization. SDP
provides an effective computation framework for many
research fields. Some applications, however, require
solving a large-scale SDP whose size exceeds the
capacity of a single processor both in terms of
computation time and available memory. SDPARA
(SemiDefinite Programming Algorithm paRAllel package)
[Yamashita et al. 2003b] was designed to solve such
large-scale SDPs. Its parallel performance is
outstanding for general SDPs in most cases. However,
the parallel implementation is less successful for some
sparse SDPs obtained from applications such as
Polynomial Optimization Problems (POPs) or Sensor
Network Localization (SNL) problems, since this version
of SDPARA cannot directly handle sparse Schur
Complement Matrices (SCMs). In this article we improve
SDPARA by focusing on the sparsity of the SCM and we
propose a new parallel implementation using the
formula-cost-based distribution along with a
replacement of the dense Cholesky factorization. We
verify numerically that these features are key to
solving SDPs with sparse SCMs more quickly on parallel
computing systems. The performance is further enhanced
by multithreading and the new SDPARA attains
considerable scalability in general. It also finds
solutions for extremely large-scale SDPs arising from
POPs which cannot be obtained by other solvers.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
author = "Jie Yu and Satish Narayanasamy and Cristiano Pereira
and Gilles Pokam",
title = "{Maple}: a coverage-driven testing tool for
multithreaded programs",
journal = j-SIGPLAN,
volume = "47",
number = "10",
pages = "485--502",
month = oct,
year = "2012",
DOI = "https://doi.org/10.1145/2398857.2384651",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Thu Nov 15 16:40:23 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Testing multithreaded programs is a hard problem,
because it is challenging to expose those rare
interleavings that can trigger a concurrency bug. We
propose a new thread interleaving coverage-driven
testing tool called Maple that seeks to expose untested
thread interleavings as much as possible. It memoizes
tested interleavings and actively seeks to expose
untested interleavings for a given test input to
increase interleaving coverage. We discuss several
solutions to realize the above goal. First, we discuss
a coverage metric based on a set of interleaving
idioms. Second, we discuss an online technique to
predict untested interleavings that can potentially be
exposed for a given test input. Finally, the predicted
untested interleavings are exposed by actively
controlling the thread schedule while executing for the
test input. We discuss our experiences in using the
tool to expose several known and unknown bugs in
real-world applications such as Apache and MySQL.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "OOPSLA '12 conference proceedings.",
author = "Eddy Zheng Zhang and Yunlian Jiang and Xipeng Shen",
title = "The Significance of {CMP} Cache Sharing on
Contemporary Multithreaded Applications",
volume = "23",
number = "2",
pages = "367--374",
month = feb,
year = "2012",
DOI = "https://doi.org/10.1109/TPDS.2011.130",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu Mar 01 14:47:13 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
Systems",
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Sergey Zhuravlev and Juan Carlos Saez and Sergey
Blagodurov and Alexandra Fedorova and Manuel Prieto",
title = "Survey of scheduling techniques for addressing shared
resources in multicore processors",
journal = j-COMP-SURV,
volume = "45",
number = "1",
pages = "4:1--4:??",
month = nov,
year = "2012",
DOI = "https://doi.org/10.1145/2379776.2379780",
ISSN = "0360-0300 (print), 1557-7341 (electronic)",
ISSN-L = "0360-0300",
bibdate = "Thu Dec 6 10:55:59 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/surveys/;
abstract = "Chip multicore processors (CMPs) have emerged as the
dominant architecture choice for modern computing
platforms and will most likely continue to be dominant
well into the foreseeable future. As with any system,
CMPs offer a unique set of challenges. Chief among them
is the shared resource contention that results because
CMP cores are not independent processors but rather
share common resources among cores such as the last
level cache (LLC). Shared resource contention can lead
to severe and unpredictable performance impact on the
threads running on the CMP. Conversely, CMPs offer
tremendous opportunities for multithreaded
applications, which can take advantage of simultaneous
thread execution as well as fast inter thread data
sharing. Many solutions have been proposed to deal with
the negative aspects of CMPs and take advantage of the
positive. This survey focuses on the subset of these
solutions that exclusively make use of OS thread-level
scheduling to achieve their goals. These solutions are
particularly attractive as they require no changes to
hardware and minimal or no changes to the OS. The OS
scheduler has expanded well beyond its original role of
time-multiplexing threads on a single core into a
complex and effective resource manager. This article
surveys a multitude of new and exciting work that
explores the diverse new roles the OS scheduler can
successfully take on.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Computing Surveys",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J204",
author = "Bernhard Beckert and Vladimir Klebanov",
title = "A {Dynamic Logic} for deductive verification of
multi-threaded programs",
journal = j-FORM-ASP-COMPUT,
volume = "25",
number = "3",
pages = "405--437",
month = may,
year = "2013",
DOI = "https://doi.org/10.1007/s00165-012-0261-4",
ISSN = "0934-5043 (print), 1433-299X (electronic)",
ISSN-L = "0934-5043",
bibdate = "Wed Mar 18 05:35:14 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/formaspcomput.bib;
URL = "http://link.springer.com/article/10.1007/s00165-012-0261-4",
acknowledgement = ack-nhfb,
fjournal = "Formal Aspects of Computing",
journal-URL = "http://link.springer.com/journal/165",
author = "Tom Bergan and Luis Ceze and Dan Grossman",
title = "Input-covering schedules for multithreaded programs",
journal = j-SIGPLAN,
volume = "48",
number = "10",
pages = "677--692",
month = oct,
year = "2013",
DOI = "https://doi.org/10.1145/2544173.2509508",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Dec 9 09:19:33 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "OOPSLA '13 conference proceedings.",
abstract = "We propose constraining multithreaded execution to
small sets of input-covering schedules, which we define
as follows: given a program $P$, we say that a set of
schedules $ \Sigma $ covers all inputs of program $P$
if, when given any input, $P$'s execution can be
constrained to some schedule in $ \Sigma $ and still
produce a semantically valid result. Our approach is to
first compute a small $ \Sigma $ for a given program
$P$, and then, at runtime, constrain $P$'s execution to
always follow some schedule in $ \Sigma $, and never
deviate. We have designed an algorithm that uses
symbolic execution to systematically enumerate a set of
input-covering schedules, $ \Sigma $. To deal with
programs that run for an unbounded length of time, we
partition execution into bounded epochs, find
input-covering schedules for each epoch in isolation,
and then piece the schedules together at runtime. We
have implemented this algorithm along with a
constrained execution runtime for pthreads programs,
and we report results. Our approach has the following
advantage: because all possible runtime schedules are
known a priori, we can seek to validate the program by
thoroughly verifying each schedule in $ \Sigma $, in
isolation, without needing to reason about the huge
space of thread interleavings that arises due to
conventional nondeterministic execution.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Kristof {Du Bois} and Jennifer B. Sartor and Stijn
Eyerman and Lieven Eeckhout",
title = "Bottle graphs: visualizing scalability bottlenecks in
multi-threaded applications",
journal = j-SIGPLAN,
volume = "48",
number = "10",
pages = "355--372",
month = oct,
year = "2013",
DOI = "https://doi.org/10.1145/2544173.2509529",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Dec 9 09:19:33 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "OOPSLA '13 conference proceedings.",
abstract = "Understanding and analyzing multi-threaded program
performance and scalability is far from trivial, which
severely complicates parallel software development and
optimization. In this paper, we present bottle graphs,
a powerful analysis tool that visualizes multi-threaded
program performance, in regards to both per-thread
parallelism and execution time. Each thread is
represented as a box, with its height equal to the
share of that thread in the total program execution
time, its width equal to its parallelism, and its area
equal to its total running time. The boxes of all
threads are stacked upon each other, leading to a stack
with height equal to the total program execution time.
Bottle graphs show exactly how scalable each thread is,
and thus guide optimization towards those threads that
have a smaller parallel component (narrower), and a
larger share of the total execution time (taller), i.e.
to the 'neck' of the bottle. Using light-weight OS
modules, we calculate bottle graphs for unmodified
multi-threaded programs running on real processors with
an average overhead of 0.68\%. To demonstrate their
utility, we do an extensive analysis of 12 Java
benchmarks running on top of the Jikes JVM, which
introduces many JVM service threads. We not only reveal
and explain scalability limitations of several
well-known Java benchmarks; we also analyze the reasons
why the garbage collector itself does not scale, and in
fact performs optimally with two collector threads for
all benchmarks, regardless of the number of application
threads. Finally, we compare the scalability of Jikes
versus the OpenJDK JVM. We demonstrate how useful and
intuitive bottle graphs are as a tool to analyze
scalability and help optimize multi-threaded
applications.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Hadi Jooybar and Wilson W. L. Fung and Mike O'Connor and
Joseph Devietti and Tor M. Aamodt",
title = "{GPUDet}: a deterministic {GPU} architecture",
journal = j-SIGPLAN,
volume = "48",
number = "4",
pages = "1--12",
month = apr,
year = "2013",
DOI = "https://doi.org/10.1145/2499368.2451118",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Jul 1 17:15:23 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Nondeterminism is a key challenge in developing
multithreaded applications. Even with the same input,
each execution of a multithreaded program may produce a
different output. This behavior complicates debugging
and limits one's ability to test for correctness. This
non-reproducibility situation is aggravated on
massively parallel architectures like graphics
processing units (GPUs) with thousands of concurrent
threads. We believe providing a deterministic
environment to ease debugging and testing of GPU
applications is essential to enable a broader class of
software to use GPUs. Many hardware and software
techniques have been proposed for providing determinism
on general-purpose multi-core processors. However,
these techniques are designed for small numbers of
threads. Scaling them to thousands of threads on a GPU
is a major challenge. This paper proposes a scalable
hardware mechanism, GPUDet, to provide determinism in
GPU architectures. In this paper we characterize the
existing deterministic and nondeterministic aspects of
current GPU execution models, and we use these
observations to inform GPUDet's design. For example,
GPUDet leverages the inherent determinism of the SIMD
hardware in GPUs to provide determinism within a
wavefront at no cost. GPUDet also exploits the Z-Buffer
Unit, an existing GPU hardware unit for graphics
rendering, to allow parallel out-of-order memory writes
to produce a deterministic output. Other optimizations
in GPUDet include deterministic parallel execution of
atomic operations and a workgroup-aware algorithm that
eliminates unnecessary global synchronizations. Our
simulation results indicate that GPUDet incurs only 2X
slowdown on average over a baseline nondeterministic
architecture, with runtime overheads as low as 4\% for
compute-bound applications, despite running GPU kernels
with thousands of threads. We also characterize the
sources of overhead for deterministic execution on GPUs
to provide insights for further optimizations.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ASPLOS '13 conference proceedings.",
author = "Michael D. Bond and Milind Kulkarni and Man Cao and
Minjia Zhang and Meisam Fathi Salmi and Swarnendu
Biswas and Aritra Sengupta and Jipeng Huang",
title = "{OCTET}: capturing and controlling cross-thread
dependences efficiently",
journal = j-SIGPLAN,
volume = "48",
number = "10",
pages = "693--712",
month = oct,
year = "2013",
DOI = "https://doi.org/10.1145/2544173.2509519",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Dec 9 09:19:33 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "OOPSLA '13 conference proceedings.",
abstract = "Parallel programming is essential for reaping the
benefits of parallel hardware, but it is notoriously
difficult to develop and debug reliable, scalable
software systems. One key challenge is that modern
languages and systems provide poor support for ensuring
concurrency correctness properties --- atomicity,
sequential consistency, and multithreaded determinism
--- because all existing approaches are impractical.
Dynamic, software-based approaches slow programs by up
to an order of magnitude because capturing and
controlling cross-thread dependences (i.e., conflicting
accesses to shared memory) requires synchronization at
virtually every access to potentially shared memory.
This paper introduces a new software-based concurrency
control mechanism called OCTET that soundly captures
cross-thread dependences and can be used to build
dynamic analyses for concurrency correctness. OCTET
achieves low overheads by tracking the locality state
of each potentially shared object. Non-conflicting
accesses conform to the locality state and require no
synchronization; only conflicting accesses require a
state change and heavyweight synchronization. This
optimistic tradeoff leads to significant efficiency
gains in capturing cross-thread dependences: a
prototype implementation of OCTET in a high-performance
Java virtual machine slows real-world concurrent
programs by only 26\% on average. A dependence
recorder, suitable for record {\&} replay, built on top
of OCTET adds an additional 5\% overhead on average.
These results suggest that OCTET can provide a
foundation for developing low-overhead analyses that
check and enforce concurrency correctness.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Ahmed Bouajjani and Michael Emmi",
title = "Analysis of Recursively Parallel Programs",
journal = j-TOPLAS,
volume = "35",
number = "3",
pages = "10:1--10:??",
month = nov,
year = "2013",
DOI = "https://doi.org/10.1145/2518188",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Fri Nov 8 17:09:04 MST 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
abstract = "We propose a general formal model of isolated
hierarchical parallel computations, and identify
several fragments to match the concurrency constructs
present in real-world programming languages such as
Cilk and X10. By associating fundamental formal models
(vector addition systems with recursive transitions) to
each fragment, we provide a common platform for
exposing the relative difficulties of algorithmic
reasoning. For each case we measure the complexity of
deciding state reachability for finite-data recursive
programs, and propose algorithms for the decidable
cases. The complexities which include PTIME, NP,
EXPSPACE, and 2EXPTIME contrast with undecidable state
reachability for recursive multithreaded programs.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Programming Languages and
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Alfredo Buttari",
title = "Fine-Grained Multithreading for the Multifrontal {$ Q
R $} Factorization of Sparse Matrices",
journal = j-SIAM-J-SCI-COMP,
volume = "35",
number = "4",
pages = "C323--C345",
month = "????",
year = "2013",
DOI = "https://doi.org/10.1137/110846427",
ISSN = "1064-8275 (print), 1095-7197 (electronic)",
ISSN-L = "1064-8275",
bibdate = "Fri Jul 19 07:44:01 MDT 2013",
bibsource = "http://epubs.siam.org/sam-bin/dbq/toc/SISC/35/4;
acknowledgement = ack-nhfb,
fjournal = "SIAM Journal on Scientific Computing",
journal-URL = "http://epubs.siam.org/sisc",
onlinedate = "January 2013",
author = "Gianpiero Cabodi and Sergio Nocco and Stefano Quer",
title = "Thread-based multi-engine model checking for multicore
platforms",
journal = j-TODAES,
volume = "18",
number = "3",
pages = "36:1--36:??",
month = jul,
year = "2013",
DOI = "https://doi.org/10.1145/2491477.2491480",
ISSN = "1084-4309 (print), 1557-7309 (electronic)",
ISSN-L = "1084-4309",
bibdate = "Sat Jul 27 08:09:07 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/todaes/;
abstract = "This article describes a multithreaded,
portfolio-based approach to model checking, where
multiple cores are exploited as the underlying
computing framework to support concurrent execution of
cooperative engines. We introduce a portfolio-based
approach to model checking. Our portfolio is first
driven by an approximate runtime predictor that
provides a heuristic approximation to a perfect oracle
and suggests which engines are more suitable for each
verification instance. Scalability and robustness of
the overall model-checking effort highly rely on a
concurrent, multithreaded model of execution. Following
similar approaches in related application fields, we
dovetail data partitioning, focused on proving several
properties in parallel, and engine partitioning, based
on concurrent runs of different model-checking engines
competing for completion of the same problem. We
investigate concurrency not only to effectively exploit
several available engines, which operate independently,
but also to show that a cooperative effort is possible.
In this case, we adopt a straightforward, light-weight,
model of inter-engine communication and data sharing.
We provide a detailed description of the ideas,
algorithms, and experimental results obtained on the
benchmarks from the Hardware Model Checking Competition
suites (HWMCC'10 and HWMCC'11).",
acknowledgement = ack-nhfb,
articleno = "36",
fjournal = "ACM Transactions on Design Automation of Electronic
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776",
author = "Yan Cai and Ke Zhai and Shangru Wu and W. K. Chan",
title = "{TeamWork}: synchronizing threads globally to detect
real deadlocks for multithreaded programs",
journal = j-SIGPLAN,
volume = "48",
number = "8",
pages = "311--312",
month = aug,
year = "2013",
DOI = "https://doi.org/10.1145/2517327.2442560",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Aug 26 13:48:51 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "PPoPP '13 conference proceedings.",
abstract = "This paper presents the aim of TeamWork, our ongoing
effort to develop a comprehensive dynamic deadlock
confirmation tool for multithreaded programs. It also
presents a refined object abstraction algorithm that
refines the existing stack hash abstraction.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Harold W. Cain and Maged M. Michael and Brad Frey and
Cathy May and Derek Williams and Hung Le",
title = "Robust architectural support for transactional memory
in the {Power} architecture",
journal = j-COMP-ARCH-NEWS,
volume = "41",
number = "3",
pages = "225--236",
month = jun,
year = "2013",
DOI = "https://doi.org/10.1145/2508148.2485942",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Sat Jul 27 06:58:55 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "ISCA '13 conference proceedings.",
abstract = "On the twentieth anniversary of the original
publication [10], following ten years of intense
activity in the research literature, hardware support
for transactional memory (TM) has finally become a
commercial reality, with HTM-enabled chips currently or
soon-to-be available from many hardware vendors. In
this paper we describe architectural support for TM
added to a future version of the Power ISA{\TM}. Two
imperatives drove the development: the desire to
complement our weakly-consistent memory model with a
more friendly interface to simplify the development and
porting of multithreaded applications, and the need for
robustness beyond that of some early implementations.
In the process of commercializing the feature, we had
to resolve some previously unexplored interactions
between TM and existing features of the ISA, for
example translation shootdown, interrupt handling,
atomic read-modify-write primitives, and our weakly
consistent memory model. We describe these
interactions, the overall architecture, and discuss the
motivation and rationale for our choices of
architectural semantics, beyond what is typically found
in reference manuals.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Eric S. Chung and John D. Davis and Jaewon Lee",
title = "{LINQits}: big data on little clients",
journal = j-COMP-ARCH-NEWS,
volume = "41",
number = "3",
pages = "261--272",
month = jun,
year = "2013",
DOI = "https://doi.org/10.1145/2508148.2485945",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Sat Jul 27 06:58:55 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "ISCA '13 conference proceedings.",
abstract = "We present LINQits, a flexible hardware template that
can be mapped onto programmable logic or ASICs in a
heterogeneous system-on-chip for a mobile device or
server. Unlike fixed-function accelerators, LINQits
accelerates a domain-specific query language called
LINQ. LINQits does not provide coverage for all
possible applications --- however, existing
applications (re-)written with LINQ in mind benefit
extensively from hardware acceleration. Furthermore,
the LINQits framework offers a graceful and transparent
migration path from software to hardware. LINQits is
prototyped on a 2W heterogeneous SoC called the ZYNQ
processor, which combines dual ARM A9 processors with
an FPGA on a single die in 28nm silicon technology. Our
physical measurements show that LINQits improves energy
efficiency by 8.9 to 30.6 times and performance by 10.7
to 38.1 times compared to optimized, multithreaded C
programs running on conventional ARM A9 processors.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Delphine Demange and Vincent Laporte and Lei Zhao and
Suresh Jagannathan and David Pichardie and Jan Vitek",
title = "{Plan B}: a buffered memory model for {Java}",
journal = j-SIGPLAN,
volume = "48",
number = "1",
pages = "329--342",
month = jan,
year = "2013",
DOI = "https://doi.org/10.1145/2480359.2429110",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Jul 1 17:15:03 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Recent advances in verification have made it possible
to envision trusted implementations of real-world
languages. Java with its type-safety and fully
specified semantics would appear to be an ideal
candidate; yet, the complexity of the translation steps
used in production virtual machines have made it a
challenging target for verifying compiler technology.
One of Java's key innovations, its memory model, poses
significant obstacles to such an endeavor. The Java
Memory Model is an ambitious attempt at specifying the
behavior of multithreaded programs in a portable,
hardware agnostic, way. While experts have an intuitive
grasp of the properties that the model should enjoy,
the specification is complex and not well-suited for
integration within a verifying compiler infrastructure.
Moreover, the specification is given in an axiomatic
style that is distant from the intuitive
reordering-based reasonings traditionally used to
justify or rule out behaviors, and ill suited to the
kind of operational reasoning one would expect to
employ in a compiler. This paper takes a step back, and
introduces a Buffered Memory Model (BMM) for Java. We
choose a pragmatic point in the design space
sacrificing generality in favor of a model that is
fully characterized in terms of the reorderings it
allows, amenable to formal reasoning, and which can be
efficiently applied to a specific hardware family,
namely x86 multiprocessors. Although the BMM restricts
the reorderings compilers are allowed to perform, it
serves as the key enabling device to achieving a
verification pathway from bytecode to machine
instructions. Despite its restrictions, we show that it
is backwards compatible with the Java Memory Model and
that it does not cripple performance on TSO
architectures.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "POPL '13 conference proceedings.",
author = "Kristof {Du Bois} and Stijn Eyerman and Jennifer B.
Sartor and Lieven Eeckhout",
title = "Criticality stacks: identifying critical threads in
parallel programs using synchronization behavior",
journal = j-COMP-ARCH-NEWS,
volume = "41",
number = "3",
pages = "511--522",
month = jun,
year = "2013",
DOI = "https://doi.org/10.1145/2508148.2485966",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Sat Jul 27 06:58:55 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "ISCA '13 conference proceedings.",
abstract = "Analyzing multi-threaded programs is quite
challenging, but is necessary to obtain good multicore
performance while saving energy. Due to
synchronization, certain threads make others wait,
because they hold a lock or have yet to reach a
barrier. We call these critical threads, i.e., threads
whose performance is determinative of program
performance as a whole. Identifying these threads can
reveal numerous optimization opportunities, for the
software developer and for hardware. In this paper, we
propose a new metric for assessing thread criticality,
which combines both how much time a thread is
performing useful work and how many co-running threads
are waiting. We show how thread criticality can be
calculated online with modest hardware additions and
with low overhead. We use our metric to create
criticality stacks that break total execution time into
each thread's criticality component, allowing for easy
visual analysis of parallel imbalance. To validate our
criticality metric, and demonstrate it is better than
previous metrics, we scale the frequency of the most
critical thread and show it achieves the largest
performance improvement. We then demonstrate the broad
applicability of criticality stacks by using them to
perform three types of optimizations: (1) program
analysis to remove parallel bottlenecks, (2)
dynamically identifying the most critical thread and
accelerating it using frequency scaling to improve
performance, and (3) showing that accelerating only the
most critical thread allows for targeted energy
reduction.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "David Ediger and Karl Jiang and E. Jason Riedy and
David A. Bader",
title = "{GraphCT}: Multithreaded Algorithms for Massive Graph
Analysis",
volume = "24",
number = "11",
pages = "2220--2229",
month = nov,
year = "2013",
DOI = "https://doi.org/10.1109/TPDS.2012.323",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Fri Nov 15 10:31:20 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
Systems",
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "P. Ferrara",
title = "A generic static analyzer for multithreaded {Java}
programs",
journal = j-SPE,
volume = "43",
number = "6",
pages = "663--684",
month = jun,
year = "2013",
DOI = "https://doi.org/10.1002/spe.2126",
ISSN = "0038-0644 (print), 1097-024X (electronic)",
ISSN-L = "0038-0644",
bibdate = "Tue Dec 3 10:30:05 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
acknowledgement = ack-nhfb,
fjournal = "Software --- Practice and Experience",
journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X",
onlinedate = "9 May 2012",
author = "Nima Honarmand and Nathan Dautenhahn and Josep
Torrellas and Samuel T. King and Gilles Pokam and
Cristiano Pereira",
title = "{Cyrus}: unintrusive application-level record-replay
for replay parallelism",
journal = j-SIGPLAN,
volume = "48",
number = "4",
pages = "193--206",
month = apr,
year = "2013",
DOI = "https://doi.org/10.1145/2499368.2451138",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Jul 1 17:15:23 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Architectures for deterministic record-replay (R\&R)
of multithreaded code are attractive for program
debugging, intrusion analysis, and fault-tolerance
uses. However, very few of the proposed designs have
focused on maximizing replay speed --- a key enabling
property of these systems. The few efforts that focus
on replay speed require intrusive hardware or software
modifications, or target whole-system R\&R rather than
the more useful application-level R\&R. This paper
presents the first hardware-based scheme for
unintrusive, application-level R\&R that explicitly
targets high replay speed. Our scheme, called Cyrus,
requires no modification to commodity snoopy cache
coherence. It introduces the concept of an on-the-fly
software Backend Pass during recording which, as the
log is being generated, transforms it for high replay
parallelism. This pass also fixes-up the log, and can
flexibly trade-off replay parallelism for log size. We
analyze the performance of Cyrus using full system (OS
plus hardware) simulation. Our results show that Cyrus
has negligible recording overhead. In addition, for
8-processor runs of SPLASH-2, Cyrus attains an average
replay parallelism of 5, and a replay speed that is, on
average, only about 50\% lower than the recording
speed.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ASPLOS '13 conference proceedings.",
author = "Jeff Huang and Charles Zhang and Julian Dolby",
title = "{CLAP}: recording local executions to reproduce
concurrency failures",
journal = j-SIGPLAN,
volume = "48",
number = "6",
pages = "141--152",
month = jun,
year = "2013",
DOI = "https://doi.org/10.1145/2499370.2462167",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Jul 1 17:15:38 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "We present CLAP, a new technique to reproduce
concurrency bugs. CLAP has two key steps. First, it
logs thread local execution paths at runtime. Second,
offline, it computes memory dependencies that accord
with the logged execution and are able to reproduce the
observed bug. The second step works by combining
constraints from the thread paths and constraints based
on a memory model, and computing an execution with a
constraint solver. CLAP has four major advantages.
First, logging purely local execution of each thread is
substantially cheaper than logging memory interactions,
which enables CLAP to be efficient compared to previous
approaches. Second, our logging does not require any
synchronization and hence with no added memory barriers
or fences; this minimizes perturbation and missed bugs
due to extra synchronizations foreclosing certain racy
behaviors. Third, since it uses no synchronization, we
extend CLAP to work on a range of relaxed memory
models, such as TSO and PSO, in addition to sequential
consistency. Fourth, CLAP can compute a much simpler
execution than the original one, that reveals the bug
with minimal thread context switches. To mitigate the
scalability issues, we also present an approach to
parallelize constraint solving, which theoretically
scales our technique to programs with arbitrary
execution length. Experimental results on a variety of
multithreaded benchmarks and real world concurrent
applications validate these advantages by showing that
our technique is effective in reproducing concurrency
bugs even under relaxed memory models; furthermore, it
is significantly more efficient than a state-of-the-art
technique that records shared memory dependencies,
reducing execution time overhead by 45\% and log size
by 88\% on average.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '13 conference proceedings.",
author = "Nicholas Hunt and Tom Bergan and Luis Ceze and Steven
D. Gribble",
title = "{DDOS}: taming nondeterminism in distributed systems",
journal = j-SIGPLAN,
volume = "48",
number = "4",
pages = "499--508",
month = apr,
year = "2013",
DOI = "https://doi.org/10.1145/2499368.2451170",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Jul 1 17:15:23 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Nondeterminism complicates the development and
management of distributed systems, and arises from two
main sources: the local behavior of each individual
node as well as the behavior of the network connecting
them. Taming nondeterminism effectively requires
dealing with both sources. This paper proposes DDOS, a
system that leverages prior work on deterministic
multithreading to offer: (1) space-efficient
record/replay of distributed systems; and (2) fully
deterministic distributed behavior. Leveraging
deterministic behavior at each node makes outgoing
messages strictly a function of explicit inputs. This
allows us to record the system by logging just
message's arrival time, not the contents. Going
further, we propose and implement an algorithm that
makes all communication between nodes deterministic by
scheduling communication onto a global logical
timeline. We implement both algorithms in a system
called DDOS and evaluate our system with parallel
scientific applications, an HTTP/memcached system and a
distributed microbenchmark with a high volume of
peer-to-peer communication. Our results show up to two
orders of magnitude reduction in log size of
record/replay, and that distributed systems can be made
deterministic with an order of magnitude of overhead.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ASPLOS '13 conference proceedings.",
author = "Jos{\'e} A. Joao and M. Aater Suleman and Onur Mutlu
and Yale N. Patt",
title = "Utility-based acceleration of multithreaded
applications on asymmetric {CMPs}",
journal = j-COMP-ARCH-NEWS,
volume = "41",
number = "3",
pages = "154--165",
month = jun,
year = "2013",
DOI = "https://doi.org/10.1145/2508148.2485936",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Sat Jul 27 06:58:55 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "ISCA '13 conference proceedings.",
abstract = "Asymmetric Chip Multiprocessors (ACMPs) are becoming a
reality. ACMPs can speed up parallel applications if
they can identify and accelerate code segments that are
critical for performance. Proposals already exist for
using coarse-grained thread scheduling and fine-grained
bottleneck acceleration. Unfortunately, there have been
no proposals offered thus far to decide which code
segments to accelerate in cases where both
coarse-grained thread scheduling and fine-grained
bottleneck acceleration could have value. This paper
proposes Utility-Based Acceleration of Multithreaded
Applications on Asymmetric CMPs (UBA), a cooperative
software/hardware mechanism for identifying and
accelerating the most likely critical code segments
from a set of multithreaded applications running on an
ACMP. The key idea is a new Utility of Acceleration
metric that quantifies the performance benefit of
accelerating a bottleneck or a thread by taking into
account both the criticality and the expected speedup.
UBA outperforms the best of two state-of-the-art
mechanisms by 11\% for single application workloads and
by 7\% for two-application workloads on an ACMP with 52
small cores and 3 large cores.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Melanie Kambadur and Kui Tang and Joshua Lopez and
Martha A. Kim",
title = "Parallel scaling properties from a basic block view",
journal = j-SIGMETRICS,
volume = "41",
number = "1",
pages = "365--366",
month = jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2494232.2465748",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Fri Feb 28 06:09:59 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "As software scalability lags behind hardware
parallelism, understanding scaling behavior is more
important than ever. This paper demonstrates how to use
Parallel Block Vector (PBV) profiles to measure the
scaling properties of multithreaded programs from a new
perspective: the basic block's view. Through this lens,
we guide users through quick and simple methods to
produce high-resolution application scaling analyses.
This method requires no manual program modification,
new hardware, or lengthy simulations, and captures the
impact of architecture, operating systems, threading
models, and inputs. We apply these techniques to a set
of parallel benchmarks, and, as an example, demonstrate
that when it comes to scaling, functions in an
application do not behave monolithically.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
author = "Hwanju Kim and Sangwook Kim and Jinkyu Jeong and
Joonwon Lee and Seungryoul Maeng",
title = "Demand-based coordinated scheduling for {SMP VMs}",
journal = j-SIGPLAN,
volume = "48",
number = "4",
pages = "369--380",
month = apr,
year = "2013",
DOI = "https://doi.org/10.1145/2499368.2451156",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Jul 1 17:15:23 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "As processor architectures have been enhancing their
computing capacity by increasing core counts,
independent workloads can be consolidated on a single
node for the sake of high resource efficiency in data
centers. With the prevalence of virtualization
technology, each individual workload can be hosted on a
virtual machine for strong isolation between co-located
workloads. Along with this trend, hosted applications
have increasingly been multithreaded to take advantage
of improved hardware parallelism. Although the
performance of many multithreaded applications highly
depends on communication (or synchronization) latency,
existing schemes of virtual machine scheduling do not
explicitly coordinate virtual CPUs based on their
communication behaviors. This paper presents a
demand-based coordinated scheduling scheme for
consolidated virtual machines that host multithreaded
workloads. To this end, we propose communication-driven
scheduling that controls time-sharing in response to
inter-processor interrupts (IPIs) between virtual CPUs.
On the basis of in-depth analysis on the relationship
between IPI communications and coordination demands, we
devise IPI-driven coscheduling and delayed preemption
schemes, which effectively reduce synchronization
latency and unnecessary CPU consumption. In addition,
we introduce a load-conscious CPU allocation policy in
order to address load imbalance in heterogeneously
consolidated environments. The proposed schemes are
evaluated with respect to various scenarios of mixed
workloads using the PARSEC multithreaded applications.
In the evaluation, our scheme improves the overall
performance of consolidated workloads, especially
communication-intensive applications, by reducing
inefficient synchronization latency.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ASPLOS '13 conference proceedings.",
author = "Patrick A. {La Fratta} and Peter M. Kogge",
title = "Energy-efficient multithreading for a hierarchical
heterogeneous multicore through locality-cognizant
thread generation",
journal = j-J-PAR-DIST-COMP,
volume = "73",
number = "12",
pages = "1551--1562",
month = dec,
year = "2013",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Fri Nov 29 09:55:28 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib",
URL = "http://www.sciencedirect.com/science/article/pii/S0743731513001494",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Jacobo Lobeiras and Mois{\'e}s Vi{\~n}as and Margarita
Amor and Basilio B. Fraguela and Manuel Arenaz and J.
A. Garc{\'\i}a and M. J. Castro",
title = "Parallelization of shallow water simulations on
current multi-threaded systems",
journal = j-IJHPCA,
volume = "27",
number = "4",
pages = "493--512",
month = nov,
year = "2013",
DOI = "https://doi.org/10.1177/1094342012464800",
ISSN = "1094-3420 (print), 1741-2846 (electronic)",
ISSN-L = "1094-3420",
bibdate = "Fri Mar 14 15:39:57 MDT 2014",
bibsource = "http://hpc.sagepub.com/content/27/4.toc",
URL = "http://hpc.sagepub.com/content/27/4/493.full.pdf+html",
acknowledgement = ack-nhfb,
fjournal = "International Journal of High Performance Computing
Applications",
journal-URL = "http://hpc.sagepub.com/content/by/year",
onlinedate = "December 5, 2012",
author = "Kai Lu and Xu Zhou and Xiaoping Wang and Wenzhe Zhang
and Gen Li",
title = "{RaceFree}: an efficient multi-threading model for
determinism",
journal = j-SIGPLAN,
volume = "48",
number = "8",
pages = "297--298",
month = aug,
year = "2013",
DOI = "https://doi.org/10.1145/2517327.2442553",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Aug 26 13:48:51 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "PPoPP '13 Conference proceedings.",
abstract = "Current deterministic systems generally incur large
overhead due to the difficulty of detecting and
eliminating data races. This paper presents RaceFree, a
novel multi-threading runtime that adopts a relaxed
deterministic model to provide a data-race-free
environment for parallel programs. This model cuts off
unnecessary shared-memory communication by isolating
threads in separated memories, which eliminates direct
data races. Meanwhile, we leverage the happen-before
relation defined by applications themselves as one-way
communication pipes to perform necessary thread
communication. Shared-memory communication is
transparently converted to message-passing style
communication by our Memory Modification Propagation
(MMP) mechanism, which propagates local memory
modifications to other threads through the
happen-before relation pipes. The overhead of RaceFree
is 67.2\% according to our tests on parallel
benchmarks.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Brandon Lucia and Luis Ceze",
title = "Cooperative empirical failure avoidance for
multithreaded programs",
journal = j-SIGPLAN,
volume = "48",
number = "4",
pages = "39--50",
month = apr,
year = "2013",
DOI = "https://doi.org/10.1145/2499368.2451121",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Jul 1 17:15:23 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Concurrency errors in multithreaded programs are
difficult to find and fix. We propose Aviso, a system
for avoiding schedule-dependent failures. Aviso
monitors events during a program's execution and, when
a failure occurs, records a history of events from the
failing execution. It uses this history to generate
schedule constraints that perturb the order of events
in the execution and thereby avoids schedules that lead
to failures in future program executions. Aviso
leverages scenarios where many instances of the same
software run, using a statistical model of program
behavior and experimentation to determine which
constraints most effectively avoid failures. After
implementing Aviso, we showed that it decreased failure
rates for a variety of important desktop, server, and
cloud applications by orders of magnitude, with an
average overhead of less than 20\% and, in some cases,
as low as 5\%.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ASPLOS '13 conference proceedings.",
author = "Basel A. Mahafzah",
title = "Performance assessment of multithreaded quicksort
algorithm on simultaneous multithreaded architecture",
volume = "66",
number = "1",
pages = "339--363",
month = oct,
year = "2013",
DOI = "https://doi.org/10.1007/s11227-013-0910-2",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Sat Feb 8 10:21:52 MST 2014",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=66&issue=1",
URL = "http://link.springer.com/article/10.1007/s11227-013-0910-2",
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Ciaran McCreesh and Patrick Prosser",
title = "Multi-Threading a State-of-the-Art Maximum Clique
Algorithm",
volume = "6",
number = "4",
pages = "618--635",
month = dec,
year = "2013",
DOI = "https://doi.org/10.3390/a6040618",
ISSN = "1999-4893 (electronic)",
ISSN-L = "1999-4893",
bibdate = "Fri May 3 13:50:13 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/algorithms.bib",
URL = "https://www.mdpi.com/1999-4893/6/4/618",
acknowledgement = ack-nhfb,
fjournal = "Algorithms (Basel)",
journal-URL = "https://www.mdpi.com/journal/algorithms",
pubdates = "Received: 15 August 2013 / Revised: 13 September 2013
/ Accepted: 18 September 2013 / Published: 3 October
2013",
author = "Brian Norris and Brian Demsky",
title = "{CDSChecker}: checking concurrent data structures
written with {C\slash C++} atomics",
journal = j-SIGPLAN,
volume = "48",
number = "10",
pages = "131--150",
month = oct,
year = "2013",
DOI = "https://doi.org/10.1145/2544173.2509514",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Dec 9 09:19:33 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "OOPSLA '13 conference proceedings.",
abstract = "Writing low-level concurrent software has
traditionally required intimate knowledge of the entire
toolchain and often has involved coding in assembly.
New language standards have extended C and C++ with
support for low-level atomic operations and a weak
memory model, enabling developers to write portable and
efficient multithreaded code. Developing correct
low-level concurrent code is well-known to be
especially difficult under a weak memory model, where
code behavior can be surprising. Building reliable
concurrent software using C/C++ low-level atomic
operations will likely require tools that help
developers discover unexpected program behaviors. In
this paper we present CDSChecker, a tool for
exhaustively exploring the behaviors of concurrent code
under the C/C++ memory model. We develop several novel
techniques for modeling the relaxed behaviors allowed
by the memory model and for minimizing the number of
execution behaviors that CDSChecker must explore. We
have used CDSChecker to exhaustively unit test several
concurrent data structure implementations on specific
inputs and have discovered errors in both a recently
published C11 implementation of a work-stealing queue
and a single producer, single consumer queue
implementation.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Christopher O{\ss}ner and Klemens B{\"o}hm",
title = "Graphs for Mining-Based Defect Localization in
Multithreaded Programs",
journal = j-INT-J-PARALLEL-PROG,
volume = "41",
number = "4",
pages = "570--593",
month = aug,
year = "2013",
DOI = "https://doi.org/10.1007/s10766-012-0237-2",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Sat Jun 22 12:29:22 MDT 2013",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=41&issue=4",
URL = "http://link.springer.com/article/10.1007/s10766-012-0237-2",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
author = "Angshuman Parashar and Michael Pellauer and Michael
Adler and Bushra Ahsan and Neal Crago and Daniel Lustig
and Vladimir Pavlov and Antonia Zhai and Mohit Gambhir
and Aamer Jaleel and Randy Allmon and Rachid Rayess and
Stephen Maresh and Joel Emer",
title = "Triggered instructions: a control paradigm for
spatially-programmed architectures",
journal = j-COMP-ARCH-NEWS,
volume = "41",
number = "3",
pages = "142--153",
month = jun,
year = "2013",
DOI = "https://doi.org/10.1145/2508148.2485935",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Sat Jul 27 06:58:55 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "ISCA '13 conference proceedings.",
abstract = "In this paper, we present triggered instructions, a
novel control paradigm for arrays of processing
elements (PEs) aimed at exploiting spatial parallelism.
Triggered instructions completely eliminate the program
counter and allow programs to transition concisely
between states without explicit branch instructions.
They also allow efficient reactivity to inter-PE
communication traffic. The approach provides a unified
mechanism to avoid over-serialized execution,
essentially achieving the effect of techniques such as
dynamic instruction reordering and multithreading,
which each require distinct hardware mechanisms in a
traditional sequential architecture. Our analysis shows
that a triggered-instruction based spatial accelerator
can achieve 8X greater area-normalized performance than
a traditional general-purpose processor. Further
analysis shows that triggered control reduces the
number of static and dynamic instructions in the
critical paths by 62\% and 64\% respectively over a
program-counter style spatial baseline, resulting in a
speedup of 2.0X.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Gilles Pokam and Klaus Danne and Cristiano Pereira and
Rolf Kassa and Tim Kranich and Shiliang Hu and Justin
Gottschlich and Nima Honarmand and Nathan Dautenhahn
and Samuel T. King and Josep Torrellas",
title = "{QuickRec}: prototyping an {Intel} architecture
extension for record and replay of multithreaded
programs",
journal = j-COMP-ARCH-NEWS,
volume = "41",
number = "3",
pages = "643--654",
month = jun,
year = "2013",
DOI = "https://doi.org/10.1145/2508148.2485977",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Sat Jul 27 06:58:55 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "ISCA '13 conference proceedings.",
abstract = "There has been significant interest in
hardware-assisted deterministic Record and Replay (RnR)
systems for multithreaded programs on multiprocessors.
However, no proposal has implemented this technique in
a hardware prototype with full operating system
support. Such an implementation is needed to assess RnR
practicality. This paper presents QuickRec, the first
multicore Intel Architecture (IA) prototype of RnR for
multithreaded programs. QuickRec is based on QuickIA,
an Intel emulation platform for rapid prototyping of
new IA extensions. QuickRec is composed of a Xeon
server platform with FPGA-emulated second-generation
Pentium cores, and Capo3, a full software stack for
managing the recording hardware from within a modified
Linux kernel. This paper's focus is understanding and
evaluating the implementation issues of RnR on a real
platform. Our effort leads to some lessons learned, as
well as to some pointers for future research. We
demonstrate that RnR can be implemented efficiently on
a real multicore IA system. In particular, we show that
the rate of memory log generation is insignificant, and
that the recording hardware has negligible performance
overhead. However, the software stack incurs an average
recording overhead of nearly 13\%, which must be
reduced to enable always-on use of RnR.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Veselin Raychev and Martin Vechev and Manu Sridharan",
title = "Effective race detection for event-driven programs",
journal = j-SIGPLAN,
volume = "48",
number = "10",
pages = "151--166",
month = oct,
year = "2013",
DOI = "https://doi.org/10.1145/2544173.2509538",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Dec 9 09:19:33 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "OOPSLA '13 conference proceedings.",
abstract = "Like shared-memory multi-threaded programs,
event-driven programs such as client-side web
applications are susceptible to data races that are
hard to reproduce and debug. Race detection for such
programs is hampered by their pervasive use of ad hoc
synchronization, which can lead to a prohibitive number
of false positives. Race detection also faces a
scalability challenge, as a large number of
short-running event handlers can quickly overwhelm
standard vector-clock-based techniques. This paper
presents several novel contributions that address both
of these challenges. First, we introduce race coverage,
a systematic method for exposing ad hoc synchronization
and other (potentially harmful) races to the user,
significantly reducing false positives. Second, we
present an efficient connectivity algorithm for
computing race coverage. The algorithm is based on
chain decomposition and leverages the structure of
event-driven programs to dramatically decrease the
overhead of vector clocks. We implemented our
techniques in a tool called EventRacer and evaluated it
on a number of public web sites. The results indicate
substantial performance and precision improvements of
our approach over the state-of-the-art. Using
EventRacer, we found many harmful races, most of which
are beyond the reach of current techniques.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
author = "Juan Carlos S{\'a}ez and Fernando Castro and Daniel
Chaver and Manuel Prieto",
title = "Delivering fairness and priority enforcement on
asymmetric multicore systems via {OS} scheduling",
journal = j-SIGMETRICS,
volume = "41",
number = "1",
pages = "343--344",
month = jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2494232.2465532",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Fri Feb 28 06:09:59 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Symmetric-ISA (instruction set architecture)
asymmetric-performance multicore processors (AMPs) were
shown to deliver higher performance per watt and area
than symmetric CMPs for applications with diverse
architectural requirements. So, it is likely that
future multicore processors will combine big
power-hungry fast cores and small low-power slow ones.
In this paper, we propose a novel thread scheduling
algorithm that aims to improve the throughput-fairness
trade-off on AMP systems. Our experimental evaluation
on real hardware and using scheduler implementations on
a general-purpose operating system, reveals that our
proposal delivers a better throughput-fairness
trade-off than previous schedulers for a wide variety
of multi-application workloads including
single-threaded and multithreaded applications.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
author = "Nareg Sinenian and Alex B. Zylstra and Mario J.-E.
Manuel and Johan A. Frenje and Atma D. Kanojia and
Joshua Stillerman and Richard D. Petrasso",
title = "A Multithreaded Modular Software Toolkit for Control
of Complex Experiments",
journal = j-COMPUT-SCI-ENG,
volume = "15",
number = "1",
pages = "66--75",
month = jan # "\slash " # feb,
year = "2013",
DOI = "https://doi.org/10.1109/MCSE.2012.34",
ISSN = "1521-9615",
ISSN-L = "1521-9615",
bibdate = "Fri Jun 21 08:34:49 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/computscieng.bib",
acknowledgement = ack-nhfb,
fjournal = "Computing in Science and Engineering",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
author = "Won So and Alexander G. Dean",
title = "Software thread integration for instruction-level
parallelism",
journal = j-TECS,
volume = "13",
number = "1",
pages = "8:1--8:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512466",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Sep 5 19:03:11 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multimedia applications require a significantly higher
level of performance than previous workloads of
embedded systems. They have driven digital signal
processor (DSP) makers to adopt high-performance
architectures like VLIW (Very-Long Instruction Word).
Despite many efforts to exploit instruction-level
parallelism (ILP) in the application, the speed is a
fraction of what it could be, limited by the difficulty
of finding enough independent instructions to keep all
of the processor's functional units busy. This article
proposes Software Thread Integration (STI) for
instruction-level parallelism. STI is a software
technique for interleaving multiple threads of control
into a single implicitly multithreaded one. We use STI
to improve the performance on ILP processors by merging
parallel procedures into one, increasing the compiler's
scope and hence allowing it to create a more efficient
instruction schedule. Assuming the parallel procedures
are given, we define a methodology for finding the best
performing integrated procedure with a minimum
compilation time. We quantitatively estimate the
performance impact of integration, allowing various
integration scenarios to be compared and ranked via
profitability analysis. During integration of threads,
different ILP-improving code transformations are
selectively applied according to the control structure
and the ILP characteristics of the code, driven by
interactions with software pipelining. The estimated
profitability is verified and corrected by an iterative
compilation approach, compensating for possible
estimation inaccuracy. Our modeling methods combined
with limited compilation quickly find the best
integration scenario without requiring exhaustive
integration.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840",
author = "S. Tucker Taft",
title = "Tutorial: proving safety of parallel \slash
multi-threaded programs",
journal = j-SIGADA-LETTERS,
volume = "33",
number = "3",
pages = "1--2",
month = dec,
year = "2013",
DOI = "https://doi.org/10.1145/2658982.2527285",
ISSN = "1094-3641 (print), 1557-9476 (electronic)",
ISSN-L = "1094-3641",
bibdate = "Wed Sep 3 16:38:30 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib",
abstract = "This tutorial will introduce the attendees to analysis
and proof techniques for programs using parallelism and
multi-threading. There are no specific prerequisites,
but a familiarity with the notions of preconditions and
postconditions, aliasing, race conditions, and
deadlocks would be of value. The examples will be based
on the threading and parallelism models of Java, Ada,
and two new parallel languages, one called ParaSail [4]
and another, inspired by the verifiable SPARK[1][2]
subset of Ada, called Sparkel[3]. We will introduce the
distinction between safety and liveness properties, and
then focus primarily on techniques for the verification
of safety properties, including the absence of race
conditions and deadlocks. We will also discuss the
issue of determinism vs. non-determinism in parallel
and multi-threaded programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGADA Ada Letters",
journal-URL = "http://portal.acm.org/citation.cfm?id=J32",
remark = "HILT '13 conference proceedings.",
author = "Priyanka Tembey and Augusto Vega and Alper
Buyuktosunoglu and Dilma Da Silva and Pradip Bose",
title = "{SMT} switch: Software Mechanisms for Power Shifting",
volume = "12",
number = "2",
pages = "67--70",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.26",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Simultaneous multithreading (SMT) as a processor
design to achieve higher levels of system and
application throughput is a well-accepted and deployed
technique in most desktop and server processors. We
study the power implications of varying SMT levels
i.e., thread counts per core for various multi-threaded
applications on a real SMT multicore platform, and
introduce a novel software mechanism of changing SMT
level of a core to tune platform power. Power-shifting
policies by varying per core SMT levels for performance
benefits within a power cap are introduced. Projected
power savings (of 15\%) for a streaming parallel
benchmark can be attained using SMT-level power
shifting mechanisms.",
acknowledgement = ack-nhfb,
affiliation = "Tembey, P (Reprint Author), Georgia Tech, Atlanta, GA
30332 USA. Tembey, Priyanka, Georgia Tech, Atlanta, GA
30332 USA.",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Multicore platforms; Operating Systems; Power
shifting; SMT",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Tembey:2013:SSS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
author = "Benjamin Wester and David Devecsery and Peter M. Chen
and Jason Flinn and Satish Narayanasamy",
title = "Parallelizing data race detection",
journal = j-SIGPLAN,
volume = "48",
number = "4",
pages = "27--38",
month = apr,
year = "2013",
DOI = "https://doi.org/10.1145/2499368.2451120",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Jul 1 17:15:23 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Detecting data races in multithreaded programs is a
crucial part of debugging such programs, but
traditional data race detectors are too slow to use
routinely. This paper shows how to speed up race
detection by spreading the work across multiple cores.
Our strategy relies on uniparallelism, which executes
time intervals of a program (called epochs) in
parallel to provide scalability, but executes all
threads from a single epoch on a single core to
eliminate locking overhead. We use several techniques
to make parallelization effective: dividing race
detection into three phases, predicting a subset of the
analysis state, eliminating sequential work via
transitive reduction, and reducing the work needed to
maintain multiple versions of analysis via
factorization. We demonstrate our strategy by
parallelizing a happens-before detector and a
lockset-based detector. We find that uniparallelism can
significantly speed up data race detection. With 4x the
number of cores as the original application, our
strategy speeds up the median execution time by 4.4x
for a happens-before detector and 3.3x for a lockset
race detector. Even on the same number of cores as the
conventional detectors, the ability for uniparallelism
to elide analysis locks allows it to reduce the median
overhead by 13\% for a happens-before detector and 8\%
for a lockset detector.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ASPLOS '13 conference proceedings.",
author = "Hongtao Yu and Hou-Jen Ko and Zhiyuan Li",
title = "General data structure expansion for multi-threading",
journal = j-SIGPLAN,
volume = "48",
number = "6",
pages = "243--252",
month = jun,
year = "2013",
DOI = "https://doi.org/10.1145/2499370.2462182",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Mon Jul 1 17:15:38 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Among techniques for parallelizing sequential codes,
privatization is a common and significant
transformation performed by both compilers and runtime
parallelizing systems. Without privatization,
repetitive updates to the same data structures often
introduce spurious data dependencies that hide the
inherent parallelism. Unfortunately, it remains a
significant challenge to compilers to automatically
privatize dynamic and recursive data structures which
appear frequently in real applications written in
languages such as C/C++. This is because such languages
lack a naming mechanism to define the address range of
a pointer-based data structure, in contrast to arrays
with explicitly declared bounds. In this paper we
present a novel solution to this difficult problem by
expanding general data structures such that memory
accesses issued from different threads to contentious
data structures are directed to different data fields.
Based on compile-time type checking and a data
dependence graph, this aggressive extension to the
traditional scalar and array expansion isolates the
address ranges among different threads, without
struggling with privatization based on thread-private
stacks, such that the targeted loop can be effectively
parallelized. With this method fully implemented in
GCC, experiments are conducted on a set of programs
from well-known benchmark suites such as Mibench,
MediaBench II and SPECint. Results show that the new
approach can lead to a high speedup when executing the
transformed code on multiple cores.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '13 conference proceedings.",
author = "Amirreza Zarrabi and Khairulmizam Samsudin and Wan
Azizun Wan Adnan",
title = "{Linux} Support for Fast Transparent General Purpose
Checkpoint\slash Restart of Multithreaded Processes in
Loadable Kernel Module",
journal = j-J-GRID-COMP,
volume = "11",
number = "2",
pages = "187--210",
month = jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1007/s10723-013-9248-5",
ISSN = "1570-7873 (print), 1572-9184 (electronic)",
ISSN-L = "1570-7873",
bibdate = "Sat Jun 22 11:03:44 MDT 2013",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=1570-7873&volume=11&issue=2;
URL = "http://link.springer.com/article/10.1007/s10723-013-9248-5",
acknowledgement = ack-nhfb,
fjournal = "Journal of Grid Computing",
journal-URL = "http://link.springer.com/journal/10723",
author = "Omar Awile and Ivo F. Sbalzarini",
title = "A {Pthreads} Wrapper for {Fortran 2003}",
journal = j-TOMS,
volume = "40",
number = "3",
pages = "19:1--19:15",
month = apr,
year = "2014",
DOI = "https://doi.org/10.1145/2558889",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Mon Apr 21 17:42:14 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fortran3.bib;
abstract = "With the advent of multicore processors, numerical and
mathematical software relies on parallelism in order to
benefit from hardware performance increases. We present
the design and use of a Fortran 2003 wrapper for POSIX
threads, called forthreads. Forthreads is complete in
the sense that it provides native Fortran 2003
interfaces to all pthreads routines where possible. We
demonstrate the use and efficiency of forthreads for
SIMD parallelism and task parallelism. We present
forthreads/MPI implementations that enable hybrid
shared-/distributed-memory parallelism in Fortran 2003.
Our benchmarks show that forthreads offers performance
comparable to that of OpenMP, but better thread control
and more freedom. We demonstrate the latter by
presenting a multithreaded Fortran 2003 library for
POSIX Internet sockets, enabling interactive numerical
simulations with runtime control.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
author = "Davide B. Bartolini and Filippo Sironi and Donatella
Sciuto and Marco D. Santambrogio",
title = "Automated Fine-Grained {CPU} Provisioning for Virtual
journal = j-TACO,
volume = "11",
number = "3",
pages = "27:1--27:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2637480",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Ideally, the pay-as-you-go model of Infrastructure as
a Service (IaaS) clouds should enable users to rent
just enough resources (e.g., CPU or memory bandwidth)
to fulfill their service level objectives (SLOs).
Achieving this goal is hard on current IaaS offers,
which require users to explicitly specify the amount of
resources to reserve; this requirement is nontrivial
for users, because estimating the amount of resources
needed to attain application-level SLOs is often
complex, especially when resources are virtualized and
the service provider colocates virtual machines (VMs)
on host nodes. For this reason, users who deploy VMs
subject to SLOs are usually prone to overprovisioning
resources, thus resulting in inflated business costs.
This article tackles this issue with AutoPro: a runtime
system that enhances IaaS clouds with automated and
fine-grained resource provisioning based on performance
SLOs. Our main contribution with AutoPro is filling the
gap between application-level performance SLOs and
allocation of a contended resource, without requiring
explicit reservations from users. In this article, we
focus on CPU bandwidth allocation to throughput-driven,
compute-intensive multithreaded applications colocated
on a multicore processor; we show that a theoretically
sound, yet simple, control strategy can enable
automated fine-grained allocation of this contended
resource, without the need for offline profiling.
Additionally, AutoPro helps service providers optimize
infrastructure utilization by provisioning idle
resources to best-effort workloads, so as to maximize
node-level utilization. Our extensive experimental
evaluation confirms that AutoPro is able to
automatically determine and enforce allocations to meet
performance SLOs while maximizing node-level
utilization by supporting batch workloads on a
best-effort basis.",
acknowledgement = ack-nhfb,
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Tom Bergan and Dan Grossman and Luis Ceze",
title = "Symbolic execution of multithreaded programs from
arbitrary program contexts",
journal = j-SIGPLAN,
volume = "49",
number = "10",
pages = "491--506",
month = oct,
year = "2014",
DOI = "https://doi.org/10.1145/2714064.2660200",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue May 12 17:41:21 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "We describe an algorithm to perform symbolic execution
of a multithreaded program starting from an arbitrary
program context. We argue that this can enable more
efficient symbolic exploration of deep code paths in
multithreaded programs by allowing the symbolic engine
to jump directly to program contexts of interest. The
key challenge is modeling the initial context with
reasonable precision --- an overly approximate model
leads to exploration of many infeasible paths during
symbolic execution, while a very precise model would be
so expensive to compute that computing it would defeat
the purpose of jumping directly to the initial context
in the first place. We propose a context-specific
dataflow analysis that approximates the initial context
cheaply, but precisely enough to avoid some common
causes of infeasible-path explosion. This model is
necessarily approximate --- it may leave portions of
the memory state unconstrained, leaving our symbolic
execution unable to answer simple questions such as
``which thread holds lock A?''. For such cases, we
describe a novel algorithm for evaluating symbolic
synchronization during symbolic execution. Our symbolic
execution semantics are sound and complete up to the
limits of the underlying SMT solver. We describe
initial experiments on an implementation in Cloud 9.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "OOPSLA '14 conference proceedings.",
author = "Shahid H. Bokhari and {\"U}mit V. {\c{C}}ataly{\"u}rek
and Metin N. Gurcan",
title = "Massively multithreaded maxflow for image segmentation
on the {Cray XMT-2}",
journal = j-CCPE,
volume = "26",
number = "18",
pages = "2836--2855",
day = "25",
month = dec,
year = "2014",
DOI = "https://doi.org/10.1002/cpe.3181",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Wed Feb 11 22:34:11 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Practice and Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "5 Dec 2013",
author = "Paul N. Butcher",
title = "Seven concurrency models in seven weeks: when threads
publisher = "The Pragmatic Bookshelf",
address = "Dallas, TX, USA",
pages = "xiii + 275",
year = "2014",
ISBN = "1-937785-65-3 (paperback), 1-941222-27-7 (e-book)",
ISBN-13 = "978-1-937785-65-9 (paperback), 978-1-941222-27-0
LCCN = "QA76.642 .B88 2014",
bibdate = "Thu Dec 4 13:32:20 MST 2014",
bibsource = "fsz3950.oclc.org:210/WorldCat;
series = "The Pragmatic Programmers",
URL = "http://proquest.safaribooksonline.com/?fpi=9781941222737",
acknowledgement = ack-nhfb,
subject = "Computer multitasking; Parallel programming (Computer
science); Nebenl{\"a}ufigkeit; Parallelverarbeitung",
tableofcontents = "Introduction \\
Threads and locks \\
Functional programming \\
The Clojure way: separating identity from state \\
Actors \\
Communicating sequential processes \\
Data parallelism \\
The Lambda Architecture \\
Wrapping up",
author = "Y. Cai and W. K. Chan",
title = "{Magiclock}: Scalable Detection of Potential Deadlocks
in Large-Scale Multithreaded Programs",
volume = "40",
number = "3",
pages = "266--281",
month = mar,
year = "2014",
DOI = "https://doi.org/10.1109/TSE.2014.2301725",
ISSN = "0098-5589 (print), 1939-3520 (electronic)",
ISSN-L = "0098-5589",
bibdate = "Thu Feb 1 19:49:24 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6718069",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Software Engineering",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
author = "N{\'e}stor Cata{\~n}o and Ijaz Ahmed and Radu I.
Siminiceanu and Jonathan Aldrich",
title = "A case study on the lightweight verification of a
multi-threaded task server",
volume = "80",
number = "??",
pages = "169--187",
day = "1",
month = feb,
year = "2014",
ISSN = "0167-6423 (print), 1872-7964 (electronic)",
ISSN-L = "0167-6423",
bibdate = "Sat Nov 30 15:06:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0167642313000178",
acknowledgement = ack-nhfb,
fjournal = "Science of Computer Programming",
journal-URL = "http://www.sciencedirect.com/science/journal/01676423",
author = "Hao Che and Minh Nguyen",
title = "{Amdahl's Law} for multithreaded multicore
journal = j-J-PAR-DIST-COMP,
volume = "74",
number = "10",
pages = "3056--3069",
month = oct,
year = "2014",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Aug 21 16:26:06 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0743731514001142",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315/",
author = "Florian David and Gael Thomas and Julia Lawall and
Gilles Muller",
title = "Continuously measuring critical section pressure with
the free-lunch profiler",
journal = j-SIGPLAN,
volume = "49",
number = "10",
pages = "291--307",
month = oct,
year = "2014",
DOI = "https://doi.org/10.1145/2714064.2660210",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue May 12 17:41:21 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Today, Java is regularly used to implement large
multi-threaded server-class applications that use locks
to protect access to shared data. However,
understanding the impact of locks on the performance of
a system is complex, and thus the use of locks can
impede the progress of threads on configurations that
were not anticipated by the developer, during specific
phases of the execution. In this paper, we propose Free
Lunch, a new lock profiler for Java application
servers, specifically designed to identify, in-vivo,
phases where the progress of the threads is impeded by
a lock. Free Lunch is designed around a new metric,
critical section pressure (CSP), which directly
correlates the progress of the threads to each of the
locks. Using Free Lunch, we have identified phases of
high CSP, which were hidden with other lock profilers,
in the distributed Cassandra NoSQL database and in
several applications from the DaCapo 9.12, the
SPECjvm2008 and the SPECjbb2005 benchmark suites. Our
evaluation of Free Lunch shows that its overhead is
never greater than 6\%, making it suitable for in-vivo
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "OOPSLA '14 conference proceedings.",
author = "Javier Esparza and Pierre Ganty and Tom{\'a}s Poch",
title = "Pattern-Based Verification for Multithreaded
journal = j-TOPLAS,
volume = "36",
number = "3",
pages = "9:1--9:??",
month = sep,
year = "2014",
DOI = "https://doi.org/10.1145/2629644",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Tue Oct 28 17:06:29 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
abstract = "Pattern-based verification checks the correctness of
program executions that follow a given pattern, a
regular expression over the alphabet of program
transitions of the form $ w_1^* \ldots {} w_n^* $.
For multithreaded programs, the alphabet of the pattern
is given by the reads and writes to the shared storage.
We study the complexity of pattern-based verification
for multithreaded programs with shared counters and
finite variables. While unrestricted verification is
undecidable for abstracted multithreaded programs with
recursive procedures and PSPACE-complete for abstracted
multithreaded while-programs (even without counters),
we show that pattern-based verification is NP-complete
for both classes, even in the presence of counters. We
then conduct a multiparameter analysis to study the
complexity of the problem on its three natural
parameters (number of threads+counters+variables,
maximal size of a thread, size of the pattern) and on
two parameters related to thread structure (maximal
number of procedures per thread and longest simple path
of procedure calls). We present an algorithm that for a
fixed number of threads, counters, variables, and
pattern size solves the verification problem in $ {\rm
st}^{O ({\rm lsp} + \lceil \log ({\rm pr} + 1) \rceil)}
$ time, where $ {\rm st} $ is the maximal size of a
thread, $ {\rm pr} $ is the maximal number of
procedures per thread, and $ {\rm lsp} $ is the longest
simple path of procedure calls.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Programming Languages and
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Stijn Eyerman and Lieven Eeckhout",
title = "Restating the Case for Weighted-{IPC} Metrics to
Evaluate Multiprogram Workload Performance",
volume = "13",
number = "2",
pages = "93--96",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Weighted speedup is nowadays the most commonly used
multiprogram workload performance metric. Weighted
speedup is a weighted-IPC metric, i.e., the
multiprogram IPC of each program is first weighted with
its isolated IPC. Recently, Michaud questions the
validity of weighted-IPC metrics by arguing that they
are inconsistent and that weighted speedup favors
unfairness [4]. Instead, he advocates using the
arithmetic or harmonic mean of the raw IPC values of
the programs in the multiprogram workload. We show that
weighted-IPC metrics are not inconsistent, and that
weighted speedup is fair in giving equal importance to
each program. We argue that, in contrast to raw-IPC
metrics, weighted-IPC metrics have a system-level
meaning, and that raw-IPC metrics are affected by the
inherent behavior of the programs. We also show that
the choice of a metric may adversely affect the
conclusions from an experiment. We suggest to use two
weighted-IPC metrics --- system throughput (STP) and
average normalized turnaround time (ANTT) --- for
evaluating multiprogram workload performance, and to
avoid raw-IPC metrics.",
acknowledgement = ack-nhfb,
affiliation = "Eyerman, S (Reprint Author), Univ Ghent, B-9000 Ghent,
Belgium. Eyerman, Stijn; Eeckhout, Lieven, Univ Ghent,
B-9000 Ghent, Belgium.",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Research Foundation --- Flanders (FWO);
European Research Council under the European Community
funding-text = "Stijn Eyerman is supported through a postdoctoral
fellowship by the Research Foundation --- Flanders
(FWO). Additional support is provided by the European
Research Council under the European Community's Seventh
Framework Programme (FP7/2007-2013) / ERC Grant
agreement no. 259295.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ANTT; average normalized turnaround time; Benchmark
testing; C Computer Systems Organization; C.1 Processor
Architectures; C.1.3 Other Architecture Styles; C.1.3.h
Multithreaded processors; C.1.4 Parallel Architectures;
C.1.4.e Multi-core/single-chip multiprocessors; C.4
Performance of Systems; C.4.c Measurement techniques;
Degradation; Harmonic analysis; harmonic mean;
Multicore processing; multiprocessing systems;
multiprogram IPC; multiprogram workload performance
metric; multiprogramming; raw-IPC metrics; STP; system
throughput; system-level meaning; Throughput; Weight
measurement; weighted speedup; weighted-IPC metric",
number-of-cited-references = "6",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "Eyerman:2014:RCW",
web-of-science-categories = "Computer Science, Hardware \&
author = "Diego Fabregat-Traver and Yurii S. Aulchenko and Paolo
title = "Solving sequences of generalized least-squares
problems on multi-threaded architectures",
journal = j-APPL-MATH-COMP,
volume = "234",
number = "??",
pages = "606--617",
day = "15",
month = may,
year = "2014",
ISSN = "0096-3003 (print), 1873-5649 (electronic)",
ISSN-L = "0096-3003",
bibdate = "Mon Apr 21 18:04:13 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/applmathcomput2010.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0096300314002951",
acknowledgement = ack-nhfb,
fjournal = "Applied Mathematics and Computation",
journal-URL = "http://www.sciencedirect.com/science/journal/00963003/",
author = "Marc E. Frincu and St{\'e}phane Genaud and Julien
title = "On the efficiency of several {VM} provisioning
strategies for workflows with multi-threaded tasks on
journal = j-COMPUTING,
volume = "96",
number = "11",
pages = "1059--1086",
month = nov,
year = "2014",
DOI = "https://doi.org/10.1007/s00607-014-0410-0",
ISSN = "0010-485X (print), 1436-5057 (electronic)",
ISSN-L = "0010-485X",
bibdate = "Wed Feb 11 07:42:25 MST 2015",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0010-485X&volume=96&issue=11;
URL = "http://link.springer.com/article/10.1007/s00607-014-0410-0",
acknowledgement = ack-nhfb,
fjournal = "Computing",
journal-URL = "http://link.springer.com/journal/607",
author = "Prodromos Gerakios and Nikolaos Papaspyrou and
Konstantinos Sagonas",
title = "Static safety guarantees for a low-level multithreaded
language with regions",
volume = "80",
number = "??",
pages = "223--263",
day = "1",
month = feb,
year = "2014",
ISSN = "0167-6423 (print), 1872-7964 (electronic)",
ISSN-L = "0167-6423",
bibdate = "Sat Nov 30 15:06:20 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0167642313001433",
acknowledgement = ack-nhfb,
fjournal = "Science of Computer Programming",
journal-URL = "http://www.sciencedirect.com/science/journal/01676423",
author = "Jana Giceva and Gustavo Alonso and Timothy Roscoe and
Tim Harris",
title = "Deployment of query plans on multicores",
volume = "8",
number = "3",
pages = "233--244",
month = nov,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Efficient resource scheduling of multithreaded
software on multicore hardware is difficult given the
many parameters involved and the hardware heterogeneity
of existing systems. In this paper we explore the
efficient deployment of query plans over a multicore
machine. We focus on shared query systems, and
implement the proposed ideas using SharedDB. The goal
of the paper is to explore how to deliver maximum
performance and predictability, while minimizing
resource utilization when deploying query plans on
multicore machines. We propose to use resource activity
vectors to characterize the behavior of individual
database operators. We then present a novel deployment
algorithm which uses these vectors together with
dataflow information from the query plan to optimally
assign relational operators to physical cores.
Experiments demonstrate that this approach
significantly reduces resource requirements while
preserving performance and is robust across different
server architectures.",
acknowledgement = ack-nhfb,
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
author = "M. A. Gonzalez-Mesa and Eladio Gutierrez and Emilio L.
Zapata and Oscar Plata",
title = "Effective Transactional Memory Execution Management
for Improved Concurrency",
journal = j-TACO,
volume = "11",
number = "3",
pages = "24:1--24:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2633048",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "This article describes a transactional memory
execution model intended to exploit maximum parallelism
from sequential and multithreaded programs. A program
code section is partitioned into chunks that will be
mapped onto threads and executed transactionally. These
transactions run concurrently and out of order, trying
to exploit maximum parallelism but managed by a
specific fully distributed commit control to meet data
dependencies. To accomplish correct parallel execution,
a partial precedence order relation is derived from the
program code section and/or defined by the programmer.
When a conflict between chunks is eagerly detected, the
precedence order relation is used to determine the best
policy to solve the conflict that preserves the
precedence order while maximizing concurrency. The
model defines a new transactional state called executed
but not committed. This state allows exploiting
concurrency on two levels: intrathread and interthread.
Intrathread concurrency is improved by having pending
uncommitted transactions while executing a new one in
the same thread. The new state improves interthread
concurrency because it permits out-of-order transaction
commits regarding the precedence order. Our model has
been implemented in a lightweight software
transactional memory system, TinySTM, and has been
evaluated on a set of benchmarks obtaining an important
performance improvement over the baseline TM system.",
acknowledgement = ack-nhfb,
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "P. H. Guzzi and G. Agapito and M. Cannataro",
title = "{coreSNP}: Parallel Processing of Microarray Data",
journal = j-IEEE-TRANS-COMPUT,
volume = "63",
number = "12",
pages = "2961--2974",
month = dec,
year = "2014",
DOI = "https://doi.org/10.1109/TC.2013.176",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Thu Dec 4 10:36:57 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
keywords = "Affymetrix; bioinformatics; Bioinformatics;
Bioinformatics (genome or protein) databases; coreSNP
parallel software tool; distributed programming;
distributed systems; DMET SNP microarray data; DNA;
Drug Metabolism Enzymes and Transporters; drug
response; drug therapy improvement; drug toxicity;
Drugs; drugs; enzymes; experimental data analysis;
experimental data preprocessing; experimental data
storage; gene expression; genetic variation; genetics;
Genomics; genomics; genomics diffusion; graphical user
interface; graphical user interfaces; health care;
healthcare; high-throughput technologies; information
retrieval; lab-on-a-chip; maximum drug efficacy;
medical information systems; microarray data; minimal
adverse effects; multi-threading; next generation
sequencing; parallel processing; Parallel processing;
patient genotype; performance evaluation;
pharmacogenomics analysis pipeline; response times;
scalable multithreaded implementation;
single-nucleotide polymorphisms; SNP annotation;
statistical analysis; Statistical analysis; statistical
software; Throughput",
author = "Christopher M. Hayden and Karla Saur and Edward K.
Smith and Michael Hicks and Jeffrey S. Foster",
title = "{Kitsune}: Efficient, General-Purpose Dynamic Software
Updating for {C}",
journal = j-TOPLAS,
volume = "36",
number = "4",
pages = "13:1--13:??",
month = oct,
year = "2014",
DOI = "https://doi.org/10.1145/2629460",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Tue Oct 28 17:05:40 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
abstract = "Dynamic software updating (DSU) systems facilitate
software updates to running programs, thereby
permitting developers to add features and fix bugs
without downtime. This article introduces Kitsune, a
DSU system for C. Kitsune's design has three notable
features. First, Kitsune updates the whole program,
rather than individual functions, using a mechanism
that places no restrictions on data representations or
allowed compiler optimizations. Second, Kitsune makes
the important aspects of updating explicit in the
program text, making the program's semantics easy to
understand while minimizing programmer effort. Finally,
the programmer can write simple specifications to
direct Kitsune to generate code that traverses and
transforms old-version state for use by new code; such
state transformation is often necessary and is
significantly more difficult in prior DSU systems. We
have used Kitsune to update six popular, open-source,
single- and multithreaded programs and find that few
program changes are required to use Kitsune, that it
incurs essentially no performance overhead, and that
update times are fast.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Programming Languages and
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Nima Honarmand and Josep Torrellas",
title = "{RelaxReplay}: record and replay for
relaxed-consistency multiprocessors",
journal = j-COMP-ARCH-NEWS,
volume = "42",
number = "1",
pages = "223--238",
month = mar,
year = "2014",
DOI = "https://doi.org/10.1145/2654822.2541979",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Mon Aug 18 17:12:47 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Record and Deterministic Replay (RnR) of multithreaded
programs on relaxed-consistency multiprocessors has
been a long-standing problem. While there are designs
that work for Total Store Ordering (TSO), finding a
general solution that is able to record the access
reordering allowed by any relaxed-consistency model has
proved challenging. This paper presents the first
complete solution for hardware-assisted memory race
recording that works for any relaxed-consistency model
of current processors. With the scheme, called
RelaxReplay, we can build an RnR system for any
relaxed-consistency model and coherence protocol.
RelaxReplay's core innovation is a new way of capturing
memory access reordering. Each memory instruction goes
through a post-completion in-order counting step that
detects any reordering, and efficiently records it. We
evaluate RelaxReplay with simulations of an 8-core
release-consistent multicore running SPLASH-2 programs.
We observe that RelaxReplay induces negligible overhead
during recording. In addition, the average size of the
log produced is comparable to the log sizes reported
for existing solutions, and still very small compared
to the memory bandwidth of modern machines. Finally,
deterministic replay is efficient and needs minimal
hardware support.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
remark = "ASPLOS '14 conference proceedings.",
author = "Alexander Kaiser and Daniel Kroening and Thomas Wahl",
title = "A Widening Approach to Multithreaded Program
journal = j-TOPLAS,
volume = "36",
number = "4",
pages = "14:1--14:??",
month = oct,
year = "2014",
DOI = "https://doi.org/10.1145/2629608",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Tue Oct 28 17:05:40 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
abstract = "Pthread-style multithreaded programs feature rich
thread communication mechanisms, such as shared
variables, signals, and broadcasts. In this article, we
consider the automated verification of such programs
where an unknown number of threads execute a given
finite-data procedure in parallel. Such procedures are
typically obtained as predicate abstractions of
recursion-free source code written in C or Java. Many
safety problems over finite-data replicated
multithreaded programs are decidable via a reduction to
the coverability problem in certain types of
well-ordered infinite-state transition systems. On the
other hand, in full generality, this problem is
Ackermann-hard, which seems to rule out efficient
algorithmic treatment. We present a novel, sound, and
complete yet empirically efficient solution. Our
approach is to judiciously widen the original set of
coverability targets by configurations that involve
fewer threads and are thus easier to decide, and whose
exploration may well be sufficient: if they turn out
uncoverable, so are the original targets. To soften the
impact of ``bad guesses'' --- configurations that turn out
coverable --- the exploration is accompanied by a parallel
engine that generates coverable configurations; none of
these is ever selected for widening. Its job being
merely to prevent bad widening choices, such an engine
need not be complete for coverability analysis, which
enables a range of existing partial (e.g.,
nonterminating) techniques. We present extensive
experiments on multithreaded C programs, including
device driver code from FreeBSD, Solaris, and Linux
distributions. Our approach outperforms existing
coverability methods by orders of magnitude.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Programming Languages and
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "S. Kim",
title = "Synthesizing Multithreaded Code from Real-Time
Object-Oriented Models via Schedulability-Aware Thread
volume = "40",
number = "4",
pages = "413--426",
month = apr,
year = "2014",
DOI = "https://doi.org/10.1109/TSE.2013.47",
ISSN = "0098-5589 (print), 1939-3520 (electronic)",
ISSN-L = "0098-5589",
bibdate = "Thu Feb 1 19:49:24 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6617637",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Software Engineering",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
author = "T. Knopp",
booktitle = "{2014 First Workshop for High Performance Technical
Computing in Dynamic Languages}",
title = "Experimental Multi-threading Support for the {Julia}
Programming Language",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "1--5",
year = "2014",
DOI = "https://doi.org/10.1109/HPTCDL.2014.11",
bibdate = "Thu Apr 8 07:17:08 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/julia.bib;
acknowledgement = ack-nhfb,
keywords = "Julia programming language",
author = "Shahar Kvatinsky and Yuval H. Nacson and Yoav Etsion
and Eby G. Friedman and Avinoam Kolodny and Uri C.
Weiser",
title = "Memristor-Based Multithreading",
volume = "13",
number = "1",
pages = "41--44",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Switch on Event Multithreading (SoE MT, also known as
coarse-grained MT and block MT) processors run multiple
threads on a pipeline machine, while the pipeline
switches threads on stall events (e.g., cache miss).
The thread switch penalty is determined by the number
of stages in the pipeline that are flushed of in-flight
instructions. In this paper, Continuous Flow
Multithreading (CFMT), a new architecture of SoE MT, is
introduced. In CFMT, a multistate pipeline register
(MPR) holds the microarchitectural state of multiple
different threads within the execution pipeline stages,
where only one thread is active at a time. The MPRs
eliminate the need to flush in-flight instructions and
therefore significantly improve performance. In recent
years, novel memory technologies such as Resistive RAM
(RRAM) and Spin Torque Transfer Magnetoresistive RAM
(STT-MRAM), have been developed. All of these
technologies are nonvolatile, store data as resistance,
and can be described as ``{memristors.''} Memristors
are power efficient, dense, and fast as compared to
standard memory technologies such as SRAM, DRAM, and
Flash. Memristors therefore provide the opportunity to
place the MPRs physically within the pipeline stages. A
performance analysis of CFMT is compared to
conventional SoE MT processors, demonstrating up to a
2X performance improvement, while the operational
mechanism, due to the use of memristors, is low power
and low complexity as compared to conventional SoE MT
acknowledgement = ack-nhfb,
affiliation = "Kvatinsky, S (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
Kvatinsky, Shahar; Etsion, Yoav; Kolodny, Avinoam;
Weiser, Uri C., Technion Israel Inst Technol, Dept
Elect Engn, IL-32000 Haifa, Israel. Etsion, Yoav,
Technion Israel Inst Technol, Dept Comp Sci, IL-32000
Haifa, Israel. Friedman, Eby G., Univ Rochester, Dept
Elect \& Comp Engn, Rochester, NY 14627 USA.",
author-email = "skva@tx.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Hasso Plattner Institute",
funding-text = "This work was supported by the Hasso Plattner
Institute. The authors thank Ravi Patel for his
comments and area overhead estimation and to Nimrod
Wald and Guy Satat for their help in evaluating the
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "memristor; multithreaded processors; phase change
memory; RRAM; STT-MRAM",
number-of-cited-references = "21",
research-areas = "Computer Science",
times-cited = "10",
unique-id = "Kvatinsky:2014:MBM",
web-of-science-categories = "Computer Science, Hardware \&
author = "Yong Li and R. Melhem and A. K. Jones",
title = "A Practical Data Classification Framework for Scalable
and High Performance Chip-Multiprocessors",
journal = j-IEEE-TRANS-COMPUT,
volume = "63",
number = "12",
pages = "2905--2918",
month = dec,
year = "2014",
DOI = "https://doi.org/10.1109/TC.2013.161",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Thu Dec 4 10:36:57 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
keywords = "application-specific characteristics; Benchmark
testing; cache coherence; cache coherence design; cache
storage; chip multiprocessor; Coherence; coherence
directory; coherence overhead mitigation; coherence
traffic; compiler-assisted mechanism; compilers; data
access behavior; data access latency mitigation; data
classification; data classification scheme; Dynamic
scheduling; Instruction sets; interconnect; many-core
architectures; microarchitectural constructs;
multi-threaded parallel; NUCA-based caching; OpenMP;
Optimization; parallel applications; parallel
architectures; pattern classification; performance
evaluation; performance improvement; pipelined
parallel; Practically private; practically private;
program compilers; Resource management; Runtime;
scalable high-performance parallel systems; TLB;
ubiquitous computing",
author = "Tongping Liu and Chen Tian and Ziang Hu and Emery D.
title = "{PREDATOR}: predictive false sharing detection",
journal = j-SIGPLAN,
volume = "49",
number = "8",
pages = "3--14",
month = aug,
year = "2014",
DOI = "https://doi.org/10.1145/2692916.2555244",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Nov 26 16:26:30 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "False sharing is a notorious problem for multithreaded
applications that can drastically degrade both
performance and scalability. Existing approaches can
precisely identify the sources of false sharing, but
only report false sharing actually observed during
execution; they do not generalize across executions.
Because false sharing is extremely sensitive to object
layout, these detectors can easily miss false sharing
problems that can arise due to slight differences in
memory allocation order or object placement decisions
by the compiler. In addition, they cannot predict the
impact of false sharing on hardware with different
cache line sizes. This paper presents PREDATOR, a
predictive software-based false sharing detector.
PREDATOR generalizes from a single execution to
precisely predict false sharing that is latent in the
current execution. PREDATOR tracks accesses within a
range that could lead to false sharing given different
object placement. It also tracks accesses within
virtual cache lines, contiguous memory ranges that span
actual hardware cache lines, to predict sharing on
hardware platforms with larger cache line sizes. For
each, it reports the exact program location of
predicted false sharing problems, ranked by their
projected impact on performance. We evaluate PREDATOR
across a range of benchmarks and actual applications.
PREDATOR identifies problems undetectable with previous
tools, including two previously-unknown false sharing
problems, with no false positives. PREDATOR is able to
immediately locate false sharing problems in MySQL and
the Boost library that had eluded detection for
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '14 conference proceedings.",
author = "Xu Liu and John Mellor-Crummey",
title = "A tool to analyze the performance of multithreaded
programs on {NUMA} architectures",
journal = j-SIGPLAN,
volume = "49",
number = "8",
pages = "259--272",
month = aug,
year = "2014",
DOI = "https://doi.org/10.1145/2692916.2555271",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Nov 26 16:26:30 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Almost all of today's microprocessors contain memory
controllers and directly attach to memory. Modern
multiprocessor systems support non-uniform memory
access (NUMA): it is faster for a microprocessor to
access memory that is directly attached than it is to
access memory attached to another processor. Without
careful distribution of computation and data, a
multithreaded program running on such a system may have
high average memory access latency. To use
multiprocessor systems efficiently, programmers need
performance tools to guide the design of NUMA-aware
codes. To address this need, we enhanced the HPCToolkit
performance tools to support measurement and analysis
of performance problems on multiprocessor systems with
multiple NUMA domains. With these extensions,
HPCToolkit helps pinpoint, quantify, and analyze NUMA
bottlenecks in executions of multithreaded programs. It
computes derived metrics to assess the severity of
bottlenecks, analyzes memory accesses, and provides a
wealth of information to guide NUMA optimization,
including information about how to distribute data to
reduce access latency and minimize contention. This
paper describes the design and implementation of our
extensions to HPCToolkit. We demonstrate their utility
by describing case studies in which we use these
capabilities to diagnose NUMA bottlenecks in four
multithreaded applications.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '14 conference proceedings.",
author = "Bin Liu and Yinliang Zhao and Yuxiang Li and Yanjun
Sun and Boqin Feng",
title = "A thread partitioning approach for speculative
volume = "67",
number = "3",
pages = "778--805",
month = mar,
year = "2014",
DOI = "https://doi.org/10.1007/s11227-013-1000-1",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Sat Mar 8 14:59:14 MST 2014",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=67&issue=3;
URL = "http://link.springer.com/article/10.1007/s11227-013-1000-1",
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Kai Lu and Xu Zhou and Tom Bergan and Xiaoping Wang",
title = "Efficient deterministic multithreading without global
journal = j-SIGPLAN,
volume = "49",
number = "8",
pages = "287--300",
month = aug,
year = "2014",
DOI = "https://doi.org/10.1145/2692916.2555252",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Nov 26 16:26:30 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Multithreaded programs execute nondeterministically on
conventional architectures and operating systems. This
complicates many tasks, including debugging and
testing. Deterministic multithreading (DMT) makes the
output of a multithreaded program depend on its inputs
only, which can totally solve the above problem.
However, current DMT implementations suffer from a
common inefficiency: they use frequent global barriers
to enforce a deterministic ordering on memory accesses.
In this paper, we eliminate that inefficiency using an
execution model we call deterministic lazy release
consistency (DLRC). Our execution model uses the Kendo
algorithm to enforce a deterministic ordering on
synchronization, and it uses a deterministic version of
the lazy release consistency memory model to propagate
memory updates across threads. Our approach guarantees
that programs execute deterministically even when they
contain data races. We implemented a DMT system based
on these ideas (RFDet) and evaluated it using 16
parallel applications. Our implementation targets C/C++
programs that use POSIX threads. Results show that
RFDet gains nearly 2x speedup compared with DThreads---a
state-of-the-art DMT system.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '14 conference proceedings.",
author = "Pallavi Maiya and Aditya Kanade and Rupak Majumdar",
title = "Race detection for {Android} applications",
journal = j-SIGPLAN,
volume = "49",
number = "6",
pages = "316--325",
month = jun,
year = "2014",
DOI = "https://doi.org/10.1145/2666356.2594311",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Fri Sep 26 07:38:28 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Programming environments for smartphones expose a
concurrency model that combines multi-threading and
asynchronous event-based dispatch. While this enables
the development of efficient and feature-rich
applications, unforeseen thread interleavings coupled
with non-deterministic reorderings of asynchronous
tasks can lead to subtle concurrency errors in the
applications. In this paper, we formalize the
concurrency semantics of the Android programming model.
We further define the happens-before relation for
Android applications, and develop a dynamic race
detection technique based on this relation. Our
relation generalizes the so far independently studied
happens-before relations for multi-threaded programs
and single-threaded event-driven programs.
Additionally, our race detection technique uses a model
of the Android runtime environment to reduce false
positives. We have implemented a tool called
DroidRacer. It generates execution traces by
systematically testing Android applications and detects
data races by computing the happens-before relation on
the traces. We analyzed 15 Android applications
including popular applications such as Facebook,
Twitter and K-9 Mail. Our results indicate that data
races are prevalent in Android applications, and that
DroidRacer is an effective tool to identify data
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '14 conference proceedings.",
author = "Jan Kasper Martinsen and Hakan Grahn and Anders
Isberg",
title = "Heuristics for Thread-Level Speculation in {Web}
volume = "13",
number = "2",
pages = "77--80",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.26",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "JavaScript is a sequential programming language, and
Thread-Level Speculation has been proposed to
dynamically extract parallelism in order to take
advantage of parallel hardware. In previous work, we
have shown significant speed-ups with a simple on/off
speculation heuristic. In this paper, we propose and
evaluate three heuristics for dynamically adapting the
speculation: a 2-bit heuristic, an exponential
heuristic, and a combination of these two. Our results
show that the combined heuristic is able to both
increase the number of successful speculations and
decrease the execution time for 15 popular web
acknowledgement = ack-nhfb,
affiliation = "Martinsen, JK (Reprint Author), Blekinge Inst Technol,
Sch Comp, SE-37179 Karlskrona, Sweden. Martinsen, Jan
Kasper; Grahn, Hakan, Blekinge Inst Technol, Sch Comp,
SE-37179 Karlskrona, Sweden. Isberg, Anders, Sony
Mobile Commun AB, SE-22188 Lund, Sweden.",
author-email = "Jan.Kasper.Martinsen@bth.se Hakan.Grahn@bth.se
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Industrial Excellence Center EASE -
Embedded Applications Software Engineering; BESQ+
research project --- Knowledge Foundation in Sweden
funding-text = "This work was partly funded by the Industrial
Excellence Center EASE --- Embedded Applications
Software Engineering, (http://ease.cs.lth.se), and the
BESQ+ research project funded by the Knowledge
Foundation (grant number 20100311) in Sweden.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "2-bit heuristic; Automatic Parallelization; Benchmark
testing; C.1.4 Parallel Architectures; C.1.4.f
Speculative multi-threading; exponential heuristic;
Instruction sets; Internet; Java; JavaScript; Multicore
processors; Multithreading; Parallel Computing;
parallel hardware; Parallel processing; parallel
programming; sequential programming language; Social
network services; thread-level speculation; Web
number-of-cited-references = "12",
oa = "Green Published",
ORCID-numbers = "Martinsen, Jan Kasper/0000-0001-8915-3633 Grahn,
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Martinsen:2014:HTL",
web-of-science-categories = "Computer Science, Hardware \&
author = "Shin Morishima and Hiroki Matsutani",
title = "Performance Evaluations of Graph Database using {CUDA}
and {OpenMP} Compatible Libraries",
journal = j-COMP-ARCH-NEWS,
volume = "42",
number = "4",
pages = "75--80",
year = "2014",
DOI = "https://doi.org/10.1145/2693714.2693728",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Wed Dec 3 16:18:50 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Graph databases use graph structures to store data
sets as nodes, edges, and properties. They are used to
store and search the relationships between a large
number of nodes, such as social networking services and
recommendation engines that use customer social graphs.
Since computation cost for graph search queries
increases as the graph becomes large, in this paper we
accelerate the graph search functions (Dijkstra and A*
algorithms) of a graph database Neo4j using two ways:
multithreaded library and CUDA library for graphics
processing units (GPUs). We use 100,000-node graphs
generated based on a degree distribution of Facebook
social graph for evaluations. Our multi-threaded and
GPU-based implementations require an auxiliary
adjacency matrix for a target graph. The results show
that, when we do not take into account additional
overhead to generate the auxiliary adjacency matrix,
multi-threaded version improves the Dijkstra and A*
search performance by 16.2x and 13.8x compared to the
original implementation. The GPU-based implementation
improves the Dijkstra and A* search performance by
26.2x and 32.8x. When we take into account the
overhead, although the speed-ups by our implementations
are reduced, by reusing the auxiliary adjacency matrix
for multiple graph search queries we can significantly
improve the graph search performance.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
remark = "HEART '14 conference proceedings.",
author = "Hamid Mushtaq and Zaid Al-Ars and Koen Bertels",
title = "Efficient and highly portable deterministic
multithreading {(DetLock)}",
journal = j-COMPUTING,
volume = "96",
number = "12",
pages = "1131--1147",
month = dec,
year = "2014",
DOI = "https://doi.org/10.1007/s00607-013-0370-9",
ISSN = "0010-485X (print), 1436-5057 (electronic)",
ISSN-L = "0010-485X",
bibdate = "Wed Feb 11 07:42:26 MST 2015",
bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0010-485X&volume=96&issue=12;
URL = "http://link.springer.com/article/10.1007/s00607-013-0370-9",
acknowledgement = ack-nhfb,
fjournal = "Computing",
journal-URL = "http://link.springer.com/journal/607",
author = "Tri Minh Ngo and Mari{\"e}lle Stoelinga and Marieke
title = "Effective verification of confidentiality for
multi-threaded programs",
journal = j-J-COMP-SECUR,
volume = "22",
number = "2",
pages = "269--300",
month = "????",
year = "2014",
DOI = "https://doi.org/10.3233/JCS-130492",
ISSN = "0926-227X (print), 1875-8924 (electronic)",
ISSN-L = "0926-227X",
bibdate = "Tue May 24 06:26:12 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jcompsecur.bib;
acknowledgement = ack-nhfb,
fjournal = "Journal of Computer Security",
journal-URL = "http://content.iospress.com/journals/journal-of-computer-security",
author = "Artur Niewiadomski and Jaroslaw Skaruz and Wojciech
Penczek and Maciej Szreter and Mariusz Jarocki",
title = "{SMT} Versus Genetic and {OpenOpt} Algorithms:
Concrete Planning in the {PlanICS} Framework",
journal = j-FUND-INFO,
volume = "135",
number = "4",
pages = "451--466",
month = oct,
year = "2014",
DOI = "https://doi.org/10.3233/FI-2014-1134",
ISSN = "0169-2968 (print), 1875-8681 (electronic)",
ISSN-L = "0169-2968",
bibdate = "Sat Mar 5 17:20:06 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fundinfo2010.bib;
acknowledgement = ack-nhfb,
fjournal = "Fundamenta Informaticae",
journal-URL = "http://content.iospress.com/journals/fundamenta-informaticae",
author = "Ben Niu and Gang Tan",
title = "Modular control-flow integrity",
journal = j-SIGPLAN,
volume = "49",
number = "6",
pages = "577--587",
month = jun,
year = "2014",
DOI = "https://doi.org/10.1145/2666356.2594295",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Fri Sep 26 07:38:28 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Control-Flow Integrity (CFI) is a software-hardening
technique. It inlines checks into a program so that its
execution always follows a predetermined Control-Flow
Graph (CFG). As a result, CFI is effective at
preventing control-flow hijacking attacks. However,
past fine-grained CFI implementations do not support
separate compilation, which hinders its adoption. We
present Modular Control-Flow Integrity (MCFI), a new
CFI technique that supports separate compilation. MCFI
allows modules to be independently instrumented and
linked statically or dynamically. The combined module
enforces a CFG that is a combination of the individual
modules' CFGs. One challenge in supporting dynamic
linking in multithreaded code is how to ensure a safe
transition from the old CFG to the new CFG when
libraries are dynamically linked. The key technique we
use is to have the CFG represented in a runtime data
structure and have reads and updates of the data
structure wrapped in transactions to ensure thread
safety. Our evaluation on SPECCPU2006 benchmarks shows
that MCFI supports separate compilation, incurs low
overhead of around 5\%, and enhances security.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '14 conference proceedings.",
author = "Rei Odaira and Jose G. Castanos and Hisanobu Tomari",
title = "Eliminating global interpreter locks in {Ruby} through
hardware transactional memory",
journal = j-SIGPLAN,
volume = "49",
number = "8",
pages = "131--142",
month = aug,
year = "2014",
DOI = "https://doi.org/10.1145/2692916.2555247",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Nov 26 16:26:30 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Many scripting languages use a Global Interpreter Lock
(GIL) to simplify the internal designs of their
interpreters, but this kind of lock severely lowers the
multi-thread performance on multi-core machines. This
paper presents our first results eliminating the GIL in
Ruby using Hardware Transactional Memory (HTM) in the
IBM zEnterprise EC12 and Intel 4th Generation Core
processors. Though prior prototypes replaced a GIL with
HTM, we tested realistic programs, the Ruby NAS
Parallel Benchmarks (NPB), the WEBrick HTTP server, and
Ruby on Rails. We devised a new technique to
dynamically adjust the transaction lengths on a
per-bytecode basis, so that we can optimize the
likelihood of transaction aborts against the relative
overhead of the instructions to begin and end the
transactions. Our results show that HTM achieved 1.9-
to 4.4-fold speedups in the NPB programs over the GIL
with 12 threads, and 1.6- and 1.2-fold speedups in
WEBrick and Ruby on Rails, respectively. The dynamic
transaction-length adjustment chose the best
transaction lengths for any number of threads and
applications with sufficiently long running times.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '14 conference proceedings.",
author = "Achille Peternier and Danilo Ansaloni and Daniele
Bonetta and Cesare Pautasso and Walter Binder",
title = "Improving execution unit occupancy on {SMT}-based
processors through hardware-aware thread scheduling",
journal = j-FUT-GEN-COMP-SYS,
volume = "30",
number = "??",
pages = "229--241",
month = jan,
year = "2014",
ISSN = "0167-739X (print), 1872-7115 (electronic)",
ISSN-L = "0167-739X",
bibdate = "Mon Dec 2 16:57:46 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0167739X13001295",
acknowledgement = ack-nhfb,
fjournal = "Future Generation Computer Systems",
journal-URL = "http://www.sciencedirect.com/science/journal/0167739X",
author = "Darko Petrovi{\'c} and Thomas Ropars and Andr{\'e}
title = "Leveraging hardware message passing for efficient
thread synchronization",
journal = j-SIGPLAN,
volume = "49",
number = "8",
pages = "143--154",
month = aug,
year = "2014",
DOI = "https://doi.org/10.1145/2692916.2555251",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Nov 26 16:26:30 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "As the level of parallelism in manycore processors
keeps increasing, providing efficient mechanisms for
thread synchronization in concurrent programs is
becoming a major concern. On cache-coherent
shared-memory processors, synchronization efficiency is
ultimately limited by the performance of the underlying
cache coherence protocol. This paper studies how
hardware support for message passing can improve
synchronization performance. Considering the ubiquitous
problem of mutual exclusion, we adapt two
state-of-the-art solutions used on shared-memory
processors, namely the server approach and the
combining approach, to leverage the potential of
hardware message passing. We propose HybComb, a novel
combining algorithm that uses both message passing and
shared memory features of emerging hybrid processors.
We also introduce MP-Server, a straightforward
adaptation of the server approach to hardware message
passing. Evaluation on Tilera's TILE-Gx processor shows
that MP-Server can execute contended critical sections
with unprecedented throughput, as stalls related to
cache coherence are removed from the critical path.
HybComb can achieve comparable performance, while
avoiding the need to dedicate server cores.
Consequently, our queue and stack implementations,
based on MP-Server and HybComb, largely outperform
their most efficient pure-shared-memory counterparts.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '14 conference proceedings.",
author = "M. Pricopi and T. Mitra",
title = "Task Scheduling on Adaptive Multi-Core",
journal = j-IEEE-TRANS-COMPUT,
volume = "63",
number = "10",
pages = "2590--2603",
month = oct,
year = "2014",
DOI = "https://doi.org/10.1109/TC.2013.115",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Thu Nov 06 07:29:34 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
keywords = "adaptive architectures; adaptive multi-cores; adaptive
multicore architectures; core allocation; dynamic
heterogeneous multi-core; embedded domain;
general-purpose computing; ILP; instruction-level
parallelism; malleable and moldable tasks;
multi-threading; offline scheduler; on-chip cores;
online scheduler; parallel applications; parallel
architectures; power constraints; resource allocation;
resource allocation problems; Scheduling; scheduling;
sequential application; sequential code; sequential
fragments; task scheduling; thermal constraints;
thread-level parallelism; TLP",
author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi
Narayan Bhuyan",
title = "Lock contention aware thread migrations",
journal = j-SIGPLAN,
volume = "49",
number = "8",
pages = "369--370",
month = aug,
year = "2014",
DOI = "https://doi.org/10.1145/2692916.2555273",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Nov 26 16:26:30 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "On a cache-coherent multicore multiprocessor system,
the performance of a multithreaded application with
high lock contention is very sensitive to the
distribution of application threads across multiple
processors. This is because the distribution of threads
impacts the frequency of lock transfers between
processors, which in turn impacts the frequency of
last-level cache (LLC) misses that lie on the critical
path of execution. Inappropriate distribution of
threads across processors increases LLC misses in the
critical path and significantly degrades performance of
multithreaded programs. To alleviate the above problem,
this paper overviews a thread migration technique,
which migrates threads of a multithreaded program
across multicore processors so that threads seeking
locks are more likely to find the locks on the same
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '14 conference proceedings.",
author = "Xuehai Qian and Benjamin Sahelices and Depei Qian",
title = "{Pacifier}: record and replay for relaxed-consistency
multiprocessors with distributed directory protocol",
journal = j-COMP-ARCH-NEWS,
volume = "42",
number = "3",
pages = "433--444",
month = jun,
year = "2014",
DOI = "https://doi.org/10.1145/2678373.2665736",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Wed Dec 3 16:18:50 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Record and Deterministic Replay (R\&R) of
multithreaded programs on relaxed-consistency
multiprocessors with distributed directory protocol has
been a long-standing open problem. The independently
developed RelaxReplay [8] solves the problem by
assuming write atomicity. This paper proposes Pacifier,
the first R\&R scheme to provide a solution without
assuming write atomicity. R\&R for relaxed-consistency
multiprocessors needs to detect, record and replay
Sequential Consistency Violations (SCV). Pacifier has
two key components: (i) Relog, a general memory
reordering logging and replay mechanism that can
reproduce SCVs in relaxed memory models, and (ii)
Granule, an SCV detection scheme in the record phase
with good precision, that indicates whether to record
with Relog. We show that Pacifier is a sweet spot in
the design space with a reasonable trade-off between
hardware and log overhead. An evaluation with
simulations of 16, 32 and 64 processors with Release
Consistency (RC) running SPLASH-2 applications
indicates that Pacifier incurs 3.9\% $\sim$ 16\% larger
logs. The slowdown of Pacifier during replay is 10.1\%
$\sim$ 30.5\% compared to native execution.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
remark = "ISCA '14 conference proceedings.",
author = "Musfiq Rahman and Bruce R. Childers and Sangyeun Cho",
title = "{COMeT+}: Continuous Online Memory Testing with
Multi-Threading Extension",
journal = j-IEEE-TRANS-COMPUT,
volume = "63",
number = "7",
pages = "1668--1681",
month = jul,
year = "2014",
DOI = "https://doi.org/10.1109/TC.2013.65",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Mon Aug 25 08:24:32 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Haris Ribic and Yu David Liu",
title = "Energy-efficient work-stealing language runtimes",
journal = j-COMP-ARCH-NEWS,
volume = "42",
number = "1",
pages = "513--528",
month = mar,
year = "2014",
DOI = "https://doi.org/10.1145/2654822.2541971",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Mon Aug 18 17:12:47 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Work stealing is a promising approach to constructing
multithreaded program runtimes of parallel programming
languages. This paper presents HERMES, an
energy-efficient work-stealing language runtime. The
key insight is that threads in a work-stealing
environment --- thieves and victims --- have varying
impacts on the overall program running time, and a
coordination of their execution ``tempo'' can lead to
energy efficiency with minimal performance loss. The
centerpiece of HERMES is two complementary algorithms
to coordinate thread tempo: the workpath-sensitive
algorithm determines tempo for each thread based on
thief-victim relationships on the execution path,
whereas the workload-sensitive algorithm selects
appropriate tempo based on the size of work-stealing
deques. We construct HERMES on top of Intel Cilk Plus's
runtime, and implement tempo adjustment through
standard Dynamic Voltage and Frequency Scaling (DVFS).
Benchmarks running on HERMES demonstrate an average of
11--12\% energy savings with an average of 3--4\%
performance loss through meter-based measurements over
commercial CPUs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
remark = "ASPLOS '14 conference proceedings.",
author = "Timothy G. Rogers and Mike O'Connor and Tor M.
title = "Learning your limit: managing massively multithreaded
caches through scheduling",
journal = j-CACM,
volume = "57",
number = "12",
pages = "91--98",
month = dec,
year = "2014",
DOI = "https://doi.org/10.1145/2682583",
ISSN = "0001-0782 (print), 1557-7317 (electronic)",
ISSN-L = "0001-0782",
bibdate = "Thu Jan 22 08:42:40 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/cacm/;
URL = "http://cacm.acm.org/magazines/2014/12/180789/fulltext",
abstract = "The gap between processor and memory performance has
become a focal point for microprocessor research and
development over the past three decades. Modern
architectures use two orthogonal approaches to help
alleviate this issue: (1) Almost every microprocessor
includes some form of on-chip storage, usually in the
form of caches, to decrease memory latency and make
more effective use of limited memory bandwidth. (2)
Massively multithreaded architectures, such as graphics
processing units (GPUs), attempt to hide the high
latency to memory by rapidly switching between many
threads directly in hardware. This paper explores the
intersection of these two techniques. We study the
effect of accelerating highly parallel workloads with
significant locality on a massively multithreaded GPU.
We observe that the memory access stream seen by
on-chip caches is the direct result of decisions made
by the hardware thread scheduler. Our work proposes a
hardware scheduling technique that reacts to feedback
from the memory system to create a more cache-friendly
access stream. We evaluate our technique using
simulations and show a significant performance
improvement over previously proposed scheduling
mechanisms. We demonstrate the effectiveness of
scheduling as a cache management technique by comparing
cache hit rate using our scheduler and an LRU
replacement policy against other scheduling techniques
using an optimal cache replacement policy.",
acknowledgement = ack-nhfb,
fjournal = "Communications of the ACM",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J79",
author = "Malavika Samak and Murali Krishna Ramanathan",
title = "Multithreaded test synthesis for deadlock detection",
journal = j-SIGPLAN,
volume = "49",
number = "10",
pages = "473--489",
month = oct,
year = "2014",
DOI = "https://doi.org/10.1145/2714064.2660238",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue May 12 17:41:21 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Designing and implementing thread-safe multithreaded
libraries can be a daunting task as developers of these
libraries need to ensure that their implementations are
free from concurrency bugs, including deadlocks. The
usual practice involves employing software testing
and/or dynamic analysis to detect deadlocks. Their
effectiveness is dependent on well-designed
multithreaded test cases. Unsurprisingly, developing
multithreaded tests is significantly harder than
developing sequential tests for obvious reasons. In
this paper, we address the problem of automatically
synthesizing multithreaded tests that can induce
deadlocks. The key insight to our approach is that a
subset of the properties observed when a deadlock
manifests in a concurrent execution can also be
observed in a single threaded execution. We design a
novel, automatic, scalable and directed approach that
identifies these properties and synthesizes a deadlock
revealing multithreaded test. The input to our approach
is the library implementation under consideration and
the output is a set of deadlock revealing multithreaded
tests. We have implemented our approach as part of a
tool, named OMEN$^1$. OMEN is able to synthesize
multithreaded tests on many multithreaded Java
libraries. Applying a dynamic deadlock detector on the
execution of the synthesized tests results in the
detection of a number of deadlocks, including 35 real
deadlocks in classes documented as thread-safe.
Moreover, our experimental results show that dynamic
analysis on multithreaded tests that are either
synthesized randomly or developed by third-party
programmers are ineffective in detecting the
deadlocks.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "OOPSLA '14 conference proceedings.",
author = "Herbert Schildt",
title = "{Java}: The Complete Reference",
publisher = pub-MCGRAW-HILL,
address = pub-MCGRAW-HILL:adr,
edition = "Ninth",
pages = "xxxiv + 1274",
year = "2014",
ISBN = "0-07-180855-8 (paperback), 0-07-180925-2,
ISBN-13 = "978-0-07-180855-2, 978-0-07-180925-2,
LCCN = "QA76.73.J38 S332 2014eb",
bibdate = "Thu Dec 4 13:05:57 MST 2014",
bibsource = "fsz3950.oclc.org:210/WorldCat;
abstract = "Fully updated for Java SE 8, this edition explains how
to develop, compile, debug, and run Java programs. The
book covers the entire Java language, including its
syntax, keywords, and fundamental programming
principles, as well as significant portions of the Java
API library. JavaBeans, servlets, applets, and Swing
are examined and real-world examples demonstrate Java
in action. New Java SE 8 features such as lambda
expressions, the stream library, and the default
interface method are discussed in detail. This Oracle
Press resource also offers a solid introduction to
JavaFX. Topics covered include: data types, variables,
arrays, and operators; control statements; classes,
objects, and methods; method overloading and
overriding; inheritance; interfaces and packages;
exception handling; multithreaded programming;
enumerations, autoboxing, and annotations; I/O classes;
generics; lambda expressions; string handling;
collections framework; networking; event handling; AWT
and Swing; concurrent and stream API; regular
expressions; JavaFX; JavaBeans; and applets and
servlets.",
acknowledgement = ack-nhfb,
shorttableofcontents = "The history and evolution of Java \\
An overview of Java \\
Data types, variables, and arrays \\
Operators \\
Control statements \\
Introducing classes \\
A closer look at methods and classes \\
Inheritance \\
Packages and interfaces \\
Exception handling \\
Multithreaded programming \\
Enumerations, autoboxing, and annotations (metadata) \\
I/O, applets, and other topics \\
Generics \\
Lambda expressions \\
String handling \\
Exploring java.lang \\
Java.util part 1: the collections framework \\
Java.util part 2: more utility classes \\
Input/output: exploring java.io \\
Exploring NIO \\
Networking \\
The applet class \\
Event handling \\
Introducing the AWT: working with windows, graphics,
and text \\
Using AWT controls, layout managers, and menus \\
Images \\
The concurrency utilities \\
The stream API \\
Regular expressions and other packages \\
Introducing swing \\
Exploring swing \\
Introducing swing menus \\
Introducing JavaFX GUI programming \\
Exploring JavaFX controls \\
Introducing JavaFX menus \\
Java beans \\
Introducing servlets \\
Using Java's documentation comments",
subject = "Java (Langage de programmation); Programmation
Internet; Java (Computer program language); Internet
programming; Internet programming.; Java (Computer
program language)",
tableofcontents = "Part I. The Java language \\
1. The history and evolution of Java: Java's lineage;
The creation of Java; How Java changed the Internet;
Java's magic: the bytecode; Servlets: Java on the
server side; The Java buzzwords; The evolution of Java;
Java SE 8; A culture of innovation \\
2. An overview of Java: Object-oriented programming; A
first simple program; A second short program; Two
control statements; Using blocks of code; Lexical
issues; The Java class libraries \\
3. Data types, variables, and arrays: Java is a
strongly typed language; The primitive types; Integers;
Floating-point types; Characters; Booleans; A closer
look at literals; Variables; Type conversion and
casting; Automatic type promotion in expressions;
Arrays; A few words about strings; A note to C/C++
programmers about pointers \\
4. Operators: Arithmetic operators; The bitwise
operators; Relational operators; Boolean logical
operators; The assignment operator; The ? operator;
Operator precedence; Using parentheses \\
5. Control statements: Java's selection statements;
Iteration statements; Jump statements \\
6. Introducing classes: Class fundamentals; Declaring
objects; Assigning object reference variables;
Introducing methods; Constructors; The this keyword;
Garbage collection; The finalize() method; A stack
class \\
7. A closer look at methods and classes: Overloading
methods; Using objects as parameters; A closer look at
argument passing; Returning objects; Recursion;
Introducing access control; Understanding static;
Introducing final; Arrays revisited; Introducing nested
and inner classes; Exploring the string class; Using
command-line arguments; Varargs: variable-length
arguments \\
8. Inheritance: Inheritance basics; Using super;
Creating a multilevel hierarchy; When constructors are
executed; Method overriding; Dynamic method dispatch;
Using abstract classes; Using final with inheritance;
The object class \\
9. Packages and interfaces: Packages; Access
protection; Importing packages; Interfaces; Default
interface methods; Use static methods in an interface;
Final thoughts on packages and interfaces \\
10. Exception handling: Exception-handling
fundamentals; Exception types; Uncaught exceptions;
Using try and catch; Multiple catch clauses; Nested try
statements; Throw; Throws; Finally; Java's built-in
exceptions; Creating your own exception subclasses;
Chained exceptions; Three recently added exception
features; Using exceptions \\
11. Multithreaded programming: The Java thread model;
The main thread; Creating a thread; Creating multiple
threads; Using isAlive() and join(); Thread priorities;
Synchronization; Interthread communication; Suspending,
resuming, and stopping threads; Obtaining a thread's
state; Using multithreading \\
12. Enumerations, autoboxing, and annotations
(metadata): Enumerations; Type wrappers; Autoboxing;
Annotations (metadata); Type annotations; Repeating
annotations \\
13. I/O, applets, and other topics: I/O basics; Reading
console input; Writing console output; The PrintWriter
class; Reading and writing files; Automatically closing
a file; Applet fundamentals; The transient and volatile
modifiers; Using instanceof; Strictfp; Native methods;
Problems with native methods; Using assert; Static
import; Invoking overloaded constructors through
this(); Compact API profiles \\
14. Generics: What are generics?; A simple generics
example; A generic class with two type parameters; The
general form of a generic class; Bounded types; Using
wildcard arguments; Creating a generic method; Generic
interfaces; Raw types and legacy code; Generic class
hierarchies; Type inference with generics; Erasure;
Ambiguity errors; Some generic restrictions \\
15. Lambda expressions: Introducing lambda expressions;
Block lambda expressions; Generic functional
interfaces; Passing lambda expressions as arguments;
Lambda expressions and exceptions; Lambda expressions
and variable capture; Method references; Constructor
references; Predefined functional interfaces \\
Part II. The Java library. \\
16. String handling: The string constructors; String
length; Special string operations; Character
extraction; String comparison; Searching strings;
Modifying a string; Data conversion using valueOf();
Changing the case of characters within a string;
Joining strings; Additional string methods;
StringBuffer; StringBuilder \\
17. Exploring java.lang: Primitive type wrappers; Void;
Process; Runtime; ProcessBuilder; System; Object; Using
clone() and the cloneable interface; Class;
ClassLoader; Math; StrictMath; Compiler; Thread,
ThreadGroup and runnable; ThreadLocal and
InheritableThreadLocal; Package; RuntimePermission;
Throwable; SecurityManager; StackTraceElement; Enum;
ClassValue; The CharSequence interface; The comparable
interface; The appendable interface; The iterable
interface; The readable interface; The AutoCloseable
interface; The Thread.UncaughtExceptionHandler
interface; The java.lang subpackages \\
18. java.util Part 1: The collections framework:
Collections overview; JDK 5 changed the collections
framework; The collection interfaces; The collection
classes; Accessing a collection via an iterator;
Spliterators; Storing user-defined classes in
collections; The RandomAccess interface; Working with
maps; Comparators; The collection algorithms; Arrays;
The legacy classes and interfaces; Parting thoughts on
collections \\
19. java.util Part 2: More utility classes:
StringTokenizer; BitSet; Optional, OptionalDouble,
OptionalInt, and OptionalLong; Date; Calendar;
GregorianCalendar; TimeZone; SimpleTimeZone; Locale;
Random; Observable; Timer and TimerTask; Currency;
Formatter; Scanner; The ResourceBundle,
ListResourceBundle, and PropertyResourceBundle classes;
Miscellaneous utility classes and interfaces; The
java.util subpackages \\
20. Input/output: exploring java.io: The I/O classes
and interfaces; File; The AutoCloseable, Closeable, and
flushable interfaces; I/O exceptions; Two ways to close
a stream; The stream classes; The byte streams; The
character streams; The console class; Serialization;
Stream benefits \\
21. Exploring NIO: The NIO classes; NIO fundamentals;
Enhancements added to NIO by JDK 7; Using the NIO
system; Pre-JDK 7 channel-based examples \\
22. Networking: Networking basics; The networking
classes and interfaces; InetAddress; Inet4Address and
Inet6Address; TCP/IP client sockets; URL;
URLConnection; HttpURLConnection; The URI class;
Cookies; TCP/IP server sockets; Datagrams \\
23. The applet class: Two types of applets; Applet
basics; Applet architecture; An applet skeleton; Simple
applet display methods; Requesting repainting; Using
the status window; The HTML APPLET tag; Passing
parameters to applets; getDocumentBase() and
getCodeBase(); AppletContext and showDocument(); The
AudioClip interface; The AppletStub interface;
Outputting to the console \\
24. Event handling: Two event handling mechanisms; The
delegation event model; Event classes; The KeyEvent
class; Sources of events; Event listener interfaces;
Using the delegation event model; Adapter classes;
Inner classes \\
25. Introducing the AWT: working with windows,
graphics, and text: AWT classes; Window fundamentals;
Working with frame windows; Creating a frame window in
an AWT-based applet; Creating a windowed program;
Displaying information within a window; Introducing
graphics; Working with color; Setting the paint mode;
Working with fonts; Managing text output using
FontMetrics \\
26. Using AWT controls, layout managers, and menus: AWT
control fundamentals; Labels; Using buttons; Applying
check boxes; CheckboxGroup; Choice controls; Using
lists; Managing scroll bars; Using a TextField; Using a
TextArea; Understanding layout managers; Menu bars and
menus; Dialog boxes; FileDialog; A word about
overriding paint() \\
27. Images: File formats; Image fundamentals: creating,
loading, and displaying; ImageObserver; Double
buffering; MediaTracker; ImageProducer; ImageConsumer;
ImageFilter; Additional imaging classes \\
28. The concurrency utilities: The concurrent API
packages; Using synchronization objects; Phaser; Using
an executor; The TimeUnit enumeration; The concurrent
collections; Locks; Atomic operations; Parallel
programming via the fork/join framework; The
concurrency utilities versus Java's traditional
approach \\
29. The stream API: Stream basics; Reduction
operations; Using parallel streams; Mapping;
Collecting; Iterators and streams; More to explore in
the stream API \\
30. Regular expressions and other packages: The core
Java API packages; Regular expression processing;
Reflection; Remote method invocation (RMI); Formatting
date and time with java.text; The time and date API
added by JDK 8 \\
Part III. Introducing GUI programming with swing \\
31. Introducing swing: The origins of swing; Swing is
built on the AWT; Two key swing features; The MVC
connection; Components and containers; The swing
packages; A simple swing application; Event handling;
Create a swing applet; Painting in swing \\
32. Exploring swing: JLabel and ImageIcon; JTextField;
The swing buttons; JTabbedPane; JScrollPane; JList;
JComboBox; Trees; JTable \\
33. Introducing swing menus: Menu basics; An overview
of JMenuBar, JMenu, and JMenuItem; Create a main menu;
Add mnemonics and accelerators to menu items; Add
images and tooltips to menu items; Use
JRadioButtonMenuItem and JCheckBoxMenuItem; Create a
popup menu; Create a toolbar; Use actions; Put the
entire MenuDemo program together; Continuing your
exploration of swing \\
Part IV. Introducing GUI programming with JavaFX \\
34. Introducing JavaFX GUI programming: JavaFX basic
concepts; A JavaFX application skeleton; Compiling and
running a JavaFX program; The application thread; A
simple JavaFX control: label; Using buttons and events;
Drawing directly on a canvas \\
35. Exploring JavaFX controls: Using image and
ImageView; ToggleButton; RadioButton; CheckBox;
ListView; ComboBox; TextField; ScrollPane; TreeView;
Introducing effects and transforms; Adding tooltips;
Disabling a control \\
36. Introducing JavaFX menus: Menu basics; An overview
of MenuBar, Menu, and MenuItem; Create a main menu; Add
mnemonics and accelerators to menu items; Add images to
menu items; Use RadioMenuItem and CheckMenuItem; Create
a context menu; Create a toolbar; Put the entire
MenuDemo program together; Continuing your exploration
of JavaFX \\
Part V. Applying Java \\
37. Java beans: What is a Java bean?; Advantages of
Java beans; Introspection; Bound and constrained
properties; Persistence; Customizers; The Java beans
API; A bean example \\
38. Introducing servlets: Background; The life cycle of
a servlet; Servlet development options; Using Tomcat; A
simple servlet; The servlet API; The javax.servlet
package; Reading servlet parameters; The
javax.servlet.http package; Handling HTTP requests and
responses; Using cookies; Session tracking \\
Appendix. Using Java's documentation comments: The
javadoc tags; The general form of a documentation
comment; What javadoc outputs; An example that uses
documentation comments",
author = "Wen-Li Shih and Yi-Ping You and Chung-Wen Huang and
Jenq Kuen Lee",
title = "Compiler Optimization for Reducing Leakage Power in
Multithread {BSP} Programs",
journal = j-TODAES,
volume = "20",
number = "1",
pages = "9:1--9:??",
month = nov,
year = "2014",
DOI = "https://doi.org/10.1145/2668119",
ISSN = "1084-4309 (print), 1557-7309 (electronic)",
ISSN-L = "1084-4309",
bibdate = "Wed Nov 19 11:18:40 MST 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/todaes/;
abstract = "Multithread programming is widely adopted in novel
embedded system applications due to its high
performance and flexibility. This article addresses
compiler optimization for reducing the power
consumption of multithread programs. A traditional
compiler employs energy management techniques that
analyze component usage in control-flow graphs with a
focus on single-thread programs. In this environment
the leakage power can be controlled by inserting on and
off instructions based on component usage information
generated by flow equations. However, these methods
cannot be directly extended to a multithread
environment due to concurrent execution issues. This
article presents a multithread power-gating framework
composed of multithread power-gating analysis (MTPGA)
and predicated power-gating (PPG) energy management
mechanisms for reducing the leakage power when
executing multithread programs on simultaneous
multithreading (SMT) machines. Our multithread
programming model is based on hierarchical
bulk-synchronous parallel (BSP) models. Based on a
multithread component analysis with dataflow equations,
our MTPGA framework estimates the energy usage of
multithread programs and inserts PPG operations as
power controls for energy management. We performed
experiments by incorporating our power optimization
framework into SUIF compiler tools and by simulating
the energy consumption with a post-estimated SMT
simulator based on Wattch toolkits. The experimental
results show that the total energy consumption of a
system with PPG support and our power optimization
method is reduced by an average of 10.09\% for BSP
programs relative to a system without a power-gating
mechanism on leakage contribution set to 30\%; and the
total energy consumption is reduced by an average of
4.27\% on leakage contribution set to 10\%. The results
demonstrate our mechanisms are effective in reducing
the leakage energy of BSP multithread programs.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Design Automation of Electronic
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776",
author = "Srinath Sridharan and Gagan Gupta and Gurindar S.
Sohi",
title = "Adaptive, efficient, parallel execution of parallel
programs",
journal = j-SIGPLAN,
volume = "49",
number = "6",
pages = "169--180",
month = jun,
year = "2014",
DOI = "https://doi.org/10.1145/2666356.2594292",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Fri Sep 26 07:38:28 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Future multicore processors will be heterogeneous, be
increasingly less reliable, and operate in dynamically
changing operating conditions. Such environments will
result in a constantly varying pool of hardware
resources which can greatly complicate the task of
efficiently exposing a program's parallelism onto these
resources. Coupled with this uncertainty is the diverse
set of efficiency metrics that users may desire. This
paper proposes Varuna, a system that dynamically,
continuously, rapidly and transparently adapts a
program's parallelism to best match the instantaneous
capabilities of the hardware resources while satisfying
different efficiency metrics. Varuna is applicable to
both multithreaded and task-based programs and can be
seamlessly inserted between the program and the
operating system without needing to change the source
code of either. We demonstrate Varuna's effectiveness
in diverse execution environments using unaltered C/C++
parallel programs from various benchmark suites.
Regardless of the execution environment, Varuna always
outperformed the state-of-the-art approaches for the
efficiency metrics considered.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '14 conference proceedings.",
author = "Guy L. {Steele, Jr.} and Doug Lea and Christine H.
Flood",
title = "Fast splittable pseudorandom number generators",
journal = j-SIGPLAN,
volume = "49",
number = "10",
pages = "453--472",
month = oct,
year = "2014",
DOI = "https://doi.org/10.1145/2714064.2660195",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue May 12 17:41:21 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/bibnet/authors/m/marsaglia-george.bib;
abstract = "We describe a new algorithm SplitMix for an
object-oriented and splittable pseudorandom number
generator (PRNG) that is quite fast: 9 64-bit
arithmetic/logical operations per 64 bits generated. A
conventional linear PRNG object provides a generate
method that returns one pseudorandom value and updates
the state of the PRNG, but a splittable PRNG object
also has a second operation, split, that replaces the
original PRNG object with two (seemingly) independent
PRNG objects, by creating and returning a new such
object and updating the state of the original object.
Splittable PRNG objects make it easy to organize the
use of pseudorandom numbers in multithreaded programs
structured using fork-join parallelism. No locking or
synchronization is required (other than the usual
memory fence immediately after object creation).
Because the generate method has no loops or
conditionals, it is suitable for SIMD or GPU
implementation. We derive SplitMix from the DotMix
algorithm of Leiserson, Schardl, and Sukha by making a
series of program transformations and engineering
improvements. The end result is an object-oriented
version of the purely functional API used in the
Haskell library for over a decade, but SplitMix is
faster and produces pseudorandom sequences of higher
quality; it is also far superior in quality and speed
to java.util.Random, and has been included in Java JDK8
as the class java.util.SplittableRandom. We have tested
the pseudorandom sequences produced by SplitMix using
two standard statistical test suites (DieHarder and
TestU01) and they appear to be adequate for
``everyday'' use, such as in Monte Carlo algorithms and
randomized data structures where speed is important.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark-1 = "OOPSLA '14 conference proceedings.",
remark-2 = "On page 466, the authors describe an interesting
technique for improving a user-supplied seed that might
produce insufficient randomness in the next several
members of the random-number sequence: ``Long runs of
0-bits or of 1-bits in the $\gamma$ [candidate seed]
value do not cause bits of the seed to flip; an
approximate proxy for how many bits of the seed will
flip might be the number of bit pairs of the form 01 or
10 in the candidate $\gamma$ value {\tt z}. Therefore
we require that the number of such pairs, as computed
by {\tt Long.bitCount(z ^ (z >>> 1))}, exceed 24; if it
does not, then the candidate z is replaced by the XOR
of {\tt z} and {\tt 0xaaaaaaaaaaaaaaaaL}, a constant
chosen so that (a) the low bit of {\tt z} remains 1,
and (b) every bit pair of the form 00 or 11 becomes
either 01 or 10, and likewise every bit pair of the
form 01 or 10 becomes either 00 or 11, so the new value
necessarily has more than 24 bit pairs whose bits
differ. Testing shows that this trick appears to be
effective.''",
remark-3 = "From page 468: ``we did three runs of TestU01 BigCrush
on {\tt java.util.Random}; 19 tests produced clear
failure on all three runs. These included 9 Birthday
Spacings tests, 8 ClosePairs tests, a WeightDistrib
test, and a CouponCollector test. This confirms
L'Ecuyer's observation that {\tt java.util.Random}
tends to fail Birthday Spacings tests [17].'' The
reference is to \cite{LEcuyer:2001:SUR}.",
remark-4 = "From page 470: ``[L'Ecuyer] comments, `In the Java
class {\tt java.util.Random}, RNG streams can be
declared and constructed dynamically, without limit on
their number. However, no precaution seems to have been
taken regarding the independence of these streams.'''",
remark-5 = "From page 471: ``They [the generators in this paper]
should not be used for cryptographic or security
applications, because they are too predictable (the
mixing functions are easily inverted, and two
successive outputs suffice to reconstruct the internal
state), \ldots{} One version seems especially suitable
for use as a replacement for {\tt java.util.Random},
because it produces sequences of higher quality, is
faster in sequential use, is easily parallelized for
use in JDK8 stream expressions, and is amenable to
efficient implementation on SIMD and GPU
architectures.''",
author = "I-Jui Sung and Juan G{\'o}mez-Luna and Jos{\'e}
Mar{\'\i}a Gonz{\'a}lez-Linares and Nicol{\'a}s Guil
and Wen-Mei W. Hwu",
title = "In-place transposition of rectangular matrices on
accelerators",
journal = j-SIGPLAN,
volume = "49",
number = "8",
pages = "207--218",
month = aug,
year = "2014",
DOI = "https://doi.org/10.1145/2692916.2555266",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Nov 26 16:26:30 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Matrix transposition is an important algorithmic
building block for many numeric algorithms such as FFT.
It has also been used to convert the storage layout of
arrays. With more and more algebra libraries offloaded
to GPUs, a high performance in-place transposition
becomes necessary. Intuitively, in-place transposition
should be a good fit for GPU architectures due to
limited available on-board memory capacity and high
throughput. However, direct application of CPU in-place
transposition algorithms lacks the amount of
parallelism and locality required by GPUs to achieve
good performance. In this paper we present the first
known in-place matrix transposition approach for the
GPUs. Our implementation is based on a novel 3-stage
transposition algorithm where each stage is performed
using an elementary tiled-wise transposition.
Additionally, when transposition is done as part of the
memory transfer between GPU and host, our staged
approach allows hiding transposition overhead by
overlap with PCIe transfer. We show that the 3-stage
algorithm allows larger tiles and achieves 3X speedup
over a traditional 4-stage algorithm, with both
algorithms based on our high-performance elementary
transpositions on the GPU. We also show our proposed
low-level optimizations improve the sustained
throughput to more than 20 GB/s. Finally, we propose an
asynchronous execution scheme that allows CPU threads
to delegate in-place matrix transposition to GPU,
achieving a throughput of more than 3.4 GB/s (including
data transfers costs), and improving current
multithreaded implementations of in-place transposition
on CPU.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '14 conference proceedings.",
author = "Alexander Tarvo and Steven P. Reiss",
title = "Automated analysis of multithreaded programs for
performance modeling",
journal = j-SIGMETRICS,
volume = "42",
number = "1",
pages = "557--558",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2637364.2592016",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Fri Jun 27 06:38:48 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "We present an approach for building performance models
of multithreaded programs automatically. We use a
combination of static and a dynamic analyses of a
single representative run of the program to build its
model. The model can predict performance of the program
under a variety of configurations. This paper outlines
how we construct the model and demonstrates how the
resultant models accurately predict the performance and
resource utilization of complex multithreaded
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
author = "Aaron Turon and Viktor Vafeiadis and Derek Dreyer",
title = "{GPS}: navigating weak memory with ghosts, protocols,
and separation",
journal = j-SIGPLAN,
volume = "49",
number = "10",
pages = "691--707",
month = oct,
year = "2014",
DOI = "https://doi.org/10.1145/2714064.2660243",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue May 12 17:41:21 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Weak memory models formalize the inconsistent
behaviors that one can expect to observe in
multithreaded programs running on modern hardware. In
so doing, however, they complicate the
already-difficult task of reasoning about correctness
of concurrent code. Worse, they render impotent the
sophisticated formal methods that have been developed
to tame concurrency, which almost universally assume a
strong (i.e., sequentially consistent) memory model.
This paper introduces GPS, the first program logic to
provide a full-fledged suite of modern verification
techniques --- including ghost state, protocols, and
separation logic --- for high-level, structured
reasoning about weak memory. We demonstrate the
effectiveness of GPS by applying it to challenging
examples drawn from the Linux kernel as well as
lock-free data structures. We also define the semantics
of GPS and prove in Coq that it is sound with respect
to the axiomatic C11 weak memory model.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "OOPSLA '14 conference proceedings.",
author = "Jack Wadden and Alexander Lyashevsky and Sudhanva
Gurumurthi and Vilas Sridharan and Kevin Skadron",
title = "Real-world design and evaluation of compiler-managed
{GPU} redundant multithreading",
journal = j-COMP-ARCH-NEWS,
volume = "42",
number = "3",
pages = "73--84",
month = jun,
year = "2014",
DOI = "https://doi.org/10.1145/2678373.2665686",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Wed Dec 3 16:18:50 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Reliability for general purpose processing on the GPU
(GPGPU) is becoming a weak link in the construction of
reliable supercomputer systems. Because hardware
protection is expensive to develop, requires dedicated
on-chip resources, and is not portable across different
architectures, the efficiency of software solutions
such as redundant multithreading (RMT) must be
explored. This paper presents a real-world design and
evaluation of automatic software RMT on GPU hardware.
We first describe a compiler pass that automatically
converts GPGPU kernels into redundantly threaded
versions. We then perform detailed power and
performance evaluations of three RMT algorithms, each
of which provides fault coverage to a set of structures
in the GPU. Using real hardware, we show that
compiler-managed software RMT has highly variable
costs. We further analyze the individual costs of
redundant work scheduling, redundant computation, and
inter-thread communication, showing that no single
component in general is responsible for high overheads
across all applications; instead, certain workload
properties tend to cause RMT to perform well or poorly.
Finally, we demonstrate the benefit of architectural
support for RMT with a specific example of fast,
register-level thread communication.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
remark = "ISCA '14 conference proceedings.",
author = "Yunlong Xu and Rui Wang and Nilanjan Goswami and Tao
Li and Depei Qian",
title = "Software Transactional Memory for {GPU}
volume = "13",
number = "1",
pages = "49--52",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "To make applications with dynamic data sharing among
threads benefit from GPU acceleration, we propose a
novel software transactional memory system for GPU
architectures (GPU-STM). The major challenges include
ensuring good scalability with respect to the massively
multithreading of GPUs, and preventing livelocks caused
by the SIMT execution paradigm of GPUs. To this end, we
propose (1) a hierarchical validation technique and (2)
an encounter-time lock-sorting mechanism to deal with
the two challenges, respectively. Evaluation shows that
GPU-STM outperforms coarse-grain locks on GPUs by up to
acknowledgement = ack-nhfb,
affiliation = "Xu, YL (Reprint Author), Xi An Jiao Tong Univ, Sch
Elect \& Informat Engn, Xian 710049, Peoples R China.
Xu, Yunlong; Qian, Depei, Xi An Jiao Tong Univ, Sch
Elect \& Informat Engn, Xian 710049, Peoples R China.
Wang, Rui; Qian, Depei, Beihang Univ, Sch Engn \& Comp
Sci, Beijing, Peoples R China. Goswami, Nilanjan; Li,
Tao, Univ Florida, ECE Dept, Gainesville, FL USA.",
author-email = "xjtu.ylxu@stu.xjtu.edu.cn rui.wang@jsi.buaa.edu.cn
nil@ufl.edu taoli@ece.ufl.edu depeiq@xjtu.edu.cn",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF of China [61133004, 61128004,
61073011]; 863 Program of China [2012AA010902]",
funding-text = "This work is supported by NSF of China under grant
61133004, 61128004 and 61073011, and 863 Program of
China under grant 2012AA010902.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Multicore Processors; Parallel Programming; Run-time
Environments; SIMD Processors",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Xu:2014:STM",
web-of-science-categories = "Computer Science, Hardware \&
author = "Yi Yang and Huiyang Zhou",
title = "{CUDA-NP}: realizing nested thread-level parallelism
in {GPGPU} applications",
journal = j-SIGPLAN,
volume = "49",
number = "8",
pages = "93--106",
month = aug,
year = "2014",
DOI = "https://doi.org/10.1145/2692916.2555254",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Nov 26 16:26:30 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Parallel programs consist of series of code sections
with different thread-level parallelism (TLP). As a
result, it is rather common that a thread in a parallel
program, such as a GPU kernel in CUDA programs, still
contains both sequential code and parallel loops. In
order to leverage such parallel loops, the latest
Nvidia Kepler architecture introduces dynamic
parallelism, which allows a GPU thread to start another
GPU kernel, thereby reducing the overhead of launching
kernels from a CPU. However, with dynamic parallelism,
a parent thread can only communicate with its child
threads through global memory and the overhead of
launching GPU kernels is non-trivial even within GPUs.
In this paper, we first study a set of GPGPU benchmarks
that contain parallel loops, and highlight that these
benchmarks do not have a very high loop count or high
degrees of TLP. Consequently, the benefits of
leveraging such parallel loops using dynamic
parallelism are too limited to offset its overhead. We
then present our proposed solution to exploit nested
parallelism in CUDA, referred to as CUDA-NP. With
CUDA-NP, we initially enable a high number of threads
when a GPU program starts, and use control flow to
activate different numbers of threads for different
code sections. We implemented our proposed CUDA-NP
framework using a directive-based compiler approach.
For a GPU kernel, an application developer only needs
to add OpenMP-like pragmas for parallelizable code
sections. Then, our CUDA-NP compiler automatically
generates the optimized GPU kernels. It supports both
the reduction and the scan primitives, explores
different ways to distribute parallel loop iterations
into threads, and efficiently manages on-chip resource.
Our experiments show that for a set of GPGPU
benchmarks, which have already been optimized and
contain nested parallelism, our proposed CUDA-NP
framework further improves the performance by up to
6.69 times and 2.18 times on average.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '14 conference proceedings.",
author = "Junfeng Yang and Heming Cui and Jingyue Wu and Yang
Tang and Gang Hu",
title = "Making parallel programs reliable with stable
journal = j-CACM,
volume = "57",
number = "3",
pages = "58--69",
month = mar,
year = "2014",
DOI = "https://doi.org/10.1145/2500875",
ISSN = "0001-0782 (print), 1557-7317 (electronic)",
ISSN-L = "0001-0782",
bibdate = "Thu Feb 27 17:17:45 MST 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/cacm/;
abstract = "Stable multithreading dramatically simplifies the
interleaving behaviors of parallel programs, offering
new hope for making parallel programming easier.",
acknowledgement = ack-nhfb,
fjournal = "Communications of the ACM",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J79",
author = "Jos{\'e} I. Aliaga and Jos{\'e} M. Bad{\'\i}a and
Maribel Castillo and Davor Davidovi{\'c} and Rafael
Mayo and Enrique S. Quintana-Ort{\'\i}",
title = "Out-of-core macromolecular simulations on
multithreaded architectures",
journal = j-CCPE,
volume = "27",
number = "6",
pages = "1540--1550",
day = "25",
month = apr,
year = "2015",
DOI = "https://doi.org/10.1002/cpe.3357",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Sat Jul 25 19:54:07 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Practice and Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "31 Aug 2014",
author = "Jos{\'e} I. Aliaga and Hartwig Anzt and Maribel
Castillo and Juan C. Fern{\'a}ndez and Germ{\'a}n
Le{\'o}n and Joaqu{\'\i}n P{\'e}rez and Enrique S.
title = "Unveiling the performance-energy trade-off in
iterative linear system solvers for multithreaded
journal = j-CCPE,
volume = "27",
number = "4",
pages = "885--904",
day = "25",
month = mar,
year = "2015",
DOI = "https://doi.org/10.1002/cpe.3341",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Sat Jul 25 19:54:06 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Practice and Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "9 Sep 2014",
author = "Abdelhalim Amer and Huiwei Lu and Yanjie Wei and Pavan
Balaji and Satoshi Matsuoka",
title = "{MPI+Threads}: runtime contention and remedies",
journal = j-SIGPLAN,
volume = "50",
number = "8",
pages = "239--248",
month = aug,
year = "2015",
DOI = "https://doi.org/10.1145/2858788.2688522",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Feb 16 12:01:42 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Hybrid MPI+Threads programming has emerged as an
alternative model to the ``MPI everywhere'' model to
better handle the increasing core density in cluster
nodes. While the MPI standard allows multithreaded
concurrent communication, such flexibility comes with
the cost of maintaining thread safety within the MPI
implementation, typically implemented using critical
sections. In contrast to previous works that studied
the importance of critical-section granularity in MPI
implementations, in this paper we investigate the
implication of critical-section arbitration on
communication performance. We first analyze the MPI
runtime when multithreaded concurrent communication
takes place on hierarchical memory systems. Our results
indicate that the mutex-based approach that most MPI
implementations use today can incur performance
penalties due to unfair arbitration. We then present
methods to mitigate these penalties with a first-come,
first-served arbitration and a priority locking scheme
that favors threads doing useful work. Through
evaluations using several benchmarks and applications,
we demonstrate up to 5-fold improvement in
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '15 conference proceedings.",
author = "C. Axnix and G. Bayer and H. Bohm and J. von Buttlar
and M. S. Farrell and L. C. Heller and J. P. Kubala and
S. E. Lederer and R. Mansell and A. Nunez Mencias and
S. Usenbinz",
title = "{IBM z13} firmware innovations for simultaneous
multithreading and {I/O} virtualization",
journal = j-IBM-JRD,
volume = "59",
number = "4--5",
pages = "11:1--11:11",
month = jul # "\slash " # sep,
year = "2015",
ISSN = "0018-8646 (print), 2151-8556 (electronic)",
ISSN-L = "0018-8646",
bibdate = "Wed Oct 21 11:38:12 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ibmjrd.bib;
acknowledgement = ack-nhfb,
author = "Xiuxiu Bai and Endong Wang and Xiaoshe Dong and
Xingjun Zhang",
title = "A scalability prediction approach for multi-threaded
applications on manycore processors",
volume = "71",
number = "11",
pages = "4072--4094",
month = nov,
year = "2015",
DOI = "https://doi.org/10.1007/s11227-015-1505-x",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Mon Jan 25 08:18:10 MST 2016",
bibsource = "http://link.springer.com/journal/11227/71/11;
URL = "http://link.springer.com/article/10.1007/s11227-015-1505-x",
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Pramod Bhatotia and Pedro Fonseca and Umut A. Acar and
Bj{\"o}rn B. Brandenburg and Rodrigo Rodrigues",
title = "{iThreads}: a Threading Library for Parallel
Incremental Computation",
journal = j-SIGPLAN,
volume = "50",
number = "4",
pages = "645--659",
month = apr,
year = "2015",
DOI = "https://doi.org/10.1145/2775054.2694371",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue May 12 17:41:19 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Incremental computation strives for efficient
successive runs of applications by re-executing only
those parts of the computation that are affected by a
given input change instead of recomputing everything
from scratch. To realize these benefits automatically,
we describe iThreads, a threading library for parallel
incremental computation. iThreads supports unmodified
shared-memory multithreaded programs: it can be used as
a replacement for pthreads by a simple exchange of
dynamically linked libraries, without even recompiling
the application code. To enable such an interface, we
designed algorithms and an implementation to operate at
the compiled binary code level by leveraging
MMU-assisted memory access tracking and process-based
thread isolation. Our evaluation on a multicore
platform using applications from the PARSEC and Phoenix
benchmarks and two case-studies shows significant
performance gains.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ASPLOS '15 conference proceedings.",
author = "Denis Bogdanas and Grigore Rosu",
title = "{K-Java}: a Complete Semantics of {Java}",
journal = j-SIGPLAN,
volume = "50",
number = "1",
pages = "445--456",
month = jan,
year = "2015",
DOI = "https://doi.org/10.1145/2775051.2676982",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue May 12 17:41:19 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "This paper presents K-Java, a complete executable
formal semantics of Java 1.4. K-Java was extensively
tested with a test suite developed alongside the
project, following the Test Driven Development
methodology. In order to maintain clarity while
handling the great size of Java, the semantics was
split into two separate definitions --- a static
semantics and a dynamic semantics. The output of the
static semantics is a preprocessed Java program, which
is passed as input to the dynamic semantics for
execution. The preprocessed program is a valid Java
program, which uses a subset of the features of Java.
The semantics is applied to model-check multi-threaded
programs. Both the test suite and the static semantics
are generic and ready to be used in other Java-related
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "POPL '15 conference proceedings.",
author = "Yan Cai and Changjiang Jia and Shangru Wu and Ke Zhai
and Wing Kwong Chan",
title = "{ASN}: A Dynamic Barrier-Based Approach to
Confirmation of Deadlocks from Warnings for Large-Scale
Multithreaded Programs",
volume = "26",
number = "1",
pages = "13--23",
month = jan,
year = "2015",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu Feb 12 13:58:35 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
URL = "http://www.computer.org/csdl/trans/td/2015/01/06747310-abs.html",
abstract-URL = "http://www.computer.org/csdl/trans/td/2015/01/06747310-abs.html",
acknowledgement = ack-nhfb,
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Adam Chlipala",
title = "From Network Interface to Multithreaded {Web}
Applications: a Case Study in Modular Program
journal = j-SIGPLAN,
volume = "50",
number = "1",
pages = "609--622",
month = jan,
year = "2015",
DOI = "https://doi.org/10.1145/2775051.2677003",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue May 12 17:41:19 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Many verifications of realistic software systems are
monolithic, in the sense that they define single global
invariants over complete system state. More modular
proof techniques promise to support reuse of component
proofs and even reduce the effort required to verify
one concrete system, just as modularity simplifies
standard software development. This paper reports on
one case study applying modular proof techniques in the
Coq proof assistant. To our knowledge, it is the first
modular verification certifying a system that combines
infrastructure with an application of interest to end
users. We assume a nonblocking API for managing TCP
networking streams, and on top of that we work our way
up to certifying multithreaded, database-backed Web
applications. Key verified components include a
cooperative threading library and an implementation of
a domain-specific language for XML processing. We have
deployed our case-study system on mobile robots, where
it interfaces with off-the-shelf components for
sensing, actuation, and control.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "POPL '15 conference proceedings.",
author = "Adam Chlipala",
title = "{Ur\slash Web}: a Simple Model for Programming the
journal = j-SIGPLAN,
volume = "50",
number = "1",
pages = "153--165",
month = jan,
year = "2015",
DOI = "https://doi.org/10.1145/2775051.2677004",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue May 12 17:41:19 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "The World Wide Web has evolved gradually from a
document delivery platform to an architecture for
distributed programming. This largely unplanned
evolution is apparent in the set of interconnected
languages and protocols that any Web application must
manage. This paper presents Ur/Web, a domain-specific,
statically typed functional programming language with a
much simpler model for programming modern Web
applications. Ur/Web's model is unified, where programs
in a single programming language are compiled to other
``Web standards'' languages as needed; supports novel
kinds of encapsulation of Web-specific state; and
exposes simple concurrency, where programmers can
reason about distributed, multithreaded applications
via a mix of transactions and cooperative preemption.
We give a tutorial introduction to the main features of
Ur/Web and discuss the language implementation and the
production Web applications that use it.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "POPL '15 conference proceedings.",
author = "B. W. Curran and C. Jacobi and J. J. Bonanno and D. A.
Schroter and K. J. Alexander and A. Puranik and M. M.
title = "The {IBM z13} multithreaded microprocessor",
journal = j-IBM-JRD,
volume = "59",
number = "4--5",
pages = "1:1--1:13",
month = jul # "\slash " # sep,
year = "2015",
ISSN = "0018-8646 (print), 2151-8556 (electronic)",
ISSN-L = "0018-8646",
bibdate = "Wed Oct 21 11:38:12 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ibmjrd.bib;
acknowledgement = ack-nhfb,
fjournal = "IBM Journal of Research and Development",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520",
author = "Madan Das and Gabriel Southern and Jose Renau",
title = "Section-Based Program Analysis to Reduce Overhead of
Detecting Unsynchronized Thread Communication",
journal = j-TACO,
volume = "12",
number = "2",
pages = "23:1--23:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2766451",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Aug 7 09:46:00 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Most systems that test and verify parallel programs,
such as deterministic execution engines, data race
detectors, and software transactional memory systems,
require instrumenting loads and stores in an
application. This can cause a very significant runtime
and memory overhead compared to executing
uninstrumented code. Multithreaded programming
typically allows any thread to perform loads and stores
to any location in the process's address space
independently, and such tools monitor all these memory
accesses. However, many of the addresses in these
unsynchronized memory accesses are only used by a
single thread and do not affect other executing
threads. We propose Section-Based Program Analysis
(SBPA), a novel way to decompose the program into
disjoint code sections to identify and eliminate
instrumenting such loads and stores during program
compilation so that the program runtime overhead is
significantly reduced. Our analysis includes
improvements to pointer analysis and uses a few user
directives to increase the effectiveness of SBPA
further. We implemented SBPA for a deterministic
execution runtime environment and were able to
eliminate 51\% of dynamic memory access
instrumentations. When combined with directives, such
reduction increased to 63\%. We also integrated SBPA
with ThreadSanitizer, a state-of-the-art dynamic race
detector, and achieved a speedup of 2.43 (2.74 with
directives) on a geometric mean basis.",
acknowledgement = ack-nhfb,
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Wei Ding and Xulong Tang and Mahmut Kandemir and
Yuanrui Zhang and Emre Kultursay",
title = "Optimizing off-chip accesses in multicores",
journal = j-SIGPLAN,
volume = "50",
number = "6",
pages = "131--142",
month = jun,
year = "2015",
DOI = "https://doi.org/10.1145/2813885.2737989",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Feb 16 12:01:41 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "In a network-on-chip (NoC) based manycore
architecture, an off-chip data access (main memory
access) needs to travel through the on-chip network,
spending considerable amount of time within the chip
(in addition to the memory access latency). In
addition, it contends with on-chip (cache) accesses as
both use the same NoC resources. In this paper,
focusing on data-parallel, multithreaded applications,
we propose a compiler-based off-chip data access
localization strategy, which places data elements in
the memory space such that an off-chip access traverses
a minimum number of links (hops) to reach the memory
controller that handles this access. This brings three
main benefits. First, the network latency of off-chip
accesses gets reduced; second, the network latency of
on-chip accesses gets reduced; and finally, the memory
latency of off-chip accesses improves, due to reduced
queue latencies. We present an experimental evaluation
of our optimization strategy using a set of 13
multithreaded application programs under both private
and shared last-level caches. The results collected
emphasize the importance of optimizing the off-chip
data accesses.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '15 conference proceedings.",
author = "Zhenman Fang and Sanyam Mehta and Pen-Chung Yew and
Antonia Zhai and James Greensky and Gautham Beeraka and
Binyu Zang",
title = "Measuring Microarchitectural Details of Multi- and
Many-Core Memory Systems through Microbenchmarking",
journal = j-TACO,
volume = "11",
number = "4",
pages = "55:1--55:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687356",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "As multicore and many-core architectures evolve, their
memory systems are becoming increasingly more complex.
To bridge the latency and bandwidth gap between the
processor and memory, they often use a mix of
multilevel private/shared caches that are either
blocking or nonblocking and are connected by high-speed
network-on-chip. Moreover, they also incorporate
hardware and software prefetching and simultaneous
multithreading (SMT) to hide memory latency. On such
multi- and many-core systems, to incorporate various
memory optimization schemes using compiler
optimizations and performance tuning techniques, it is
crucial to have microarchitectural details of the
target memory system. Unfortunately, such details are
often unavailable from vendors, especially for newly
released processors. In this article, we propose a
novel microbenchmarking methodology based on short
elapsed-time events (SETEs) to obtain comprehensive
memory microarchitectural details in multi- and
many-core processors. This approach requires detailed
analysis of potential interfering factors that could
affect the intended behavior of such memory systems. We
lay out effective guidelines to control and mitigate
those interfering factors. Taking the impact of SMT
into consideration, our proposed methodology not only
can measure traditional cache/memory latency and
off-chip bandwidth but also can uncover the details of
software and hardware prefetching units not attempted
in previous studies. Using the newly released Intel
Xeon Phi many-core processor (with in-order cores) as
an example, we show how we can use a set of
microbenchmarks to determine various microarchitectural
features of its memory system (many are undocumented
from vendors). To demonstrate the portability and
validate the correctness of such a methodology, we use
the well-documented Intel Sandy Bridge multicore
processor (with out-of-order cores) as another example,
where most data are available and can be validated.
Moreover, to illustrate the usefulness of the measured
data, we do a multistage coordinated data prefetching
case study on both Xeon Phi and Sandy Bridge and show
that by using the measured data, we can achieve 1.3X
and 1.08X performance speedup, respectively, compared
to the state-of-the-art Intel ICC compiler. We believe
that these measurements also provide useful insights
into memory optimization, analysis, and modeling of
such multicore and many-core architectures.",
acknowledgement = ack-nhfb,
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Azadeh Farzan and Zachary Kincaid and Andreas
title = "Proof Spaces for Unbounded Parallelism",
journal = j-SIGPLAN,
volume = "50",
number = "1",
pages = "407--420",
month = jan,
year = "2015",
DOI = "https://doi.org/10.1145/2775051.2677012",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue May 12 17:41:19 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "In this paper, we present a new approach to
automatically verify multi-threaded programs which are
executed by an unbounded number of threads running in
parallel. The starting point for our work is the
problem of how we can leverage existing automated
verification technology for sequential programs
(abstract interpretation, Craig interpolation,
constraint solving, etc.) for multi-threaded programs.
Suppose that we are given a correctness proof for a
trace of a program (or for some other program
fragment). We observe that the proof can always be
decomposed into a finite set of Hoare triples, and we
ask what can be proved from the finite set of Hoare
triples using only simple combinatorial inference rules
(without access to a theorem prover and without the
possibility to infer genuinely new Hoare triples)? We
introduce a proof system where one proves the
correctness of a multi-threaded program by showing that
for each trace of the program, there exists a
correctness proof in the space of proofs that are
derivable from a finite set of axioms using simple
combinatorial inference rules. This proof system is
complete with respect to the classical proof method of
establishing an inductive invariant (which uses thread
quantification and control predicates). Moreover, it is
possible to algorithmically check whether a given set
of axioms is sufficient to prove the correctness of a
multi-threaded program, using ideas from
well-structured transition systems.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "POPL '15 conference proceedings.",
author = "Ammlan Ghosh and Rituparna Chaki and Nabendu Chaki",
title = "A new concurrency control mechanism for multi-threaded
environment using transactional memory",
volume = "71",
number = "11",
pages = "4095--4115",
month = nov,
year = "2015",
DOI = "https://doi.org/10.1007/s11227-015-1507-8",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Mon Jan 25 08:18:10 MST 2016",
bibsource = "http://link.springer.com/journal/11227/71/11;
URL = "http://link.springer.com/article/10.1007/s11227-015-1507-8;
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Mahantesh Halappanavar and Alex Pothen and Ariful Azad
and Fredrik Manne and Johannes Langguth and Arif Khan",
title = "Codesign Lessons Learned from Implementing Graph
Matching on Multithreaded Architectures",
journal = j-COMPUTER,
volume = "48",
number = "8",
pages = "46--55",
month = aug,
year = "2015",
DOI = "https://doi.org/10.1109/MC.2015.215",
ISSN = "0018-9162 (print), 1558-0814 (electronic)",
ISSN-L = "0018-9162",
bibdate = "Tue Nov 3 07:04:37 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/computer2010.bib;
URL = "http://csdl.computer.org/csdl/mags/co/2015/08/mco2015080046-abs.html",
abstract-URL = "http://csdl.computer.org/csdl/mags/co/2015/08/mco2015080046-abs.html",
acknowledgement = ack-nhfb,
journal-URL = "http://www.computer.org/portal/web/csdl/magazines/computer",
author = "Thibaud Hottelier and Rastislav Bodik",
title = "Synthesis of layout engines from relational
journal = j-SIGPLAN,
volume = "50",
number = "10",
pages = "74--88",
month = oct,
year = "2015",
DOI = "https://doi.org/10.1145/2858965.2814291",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Feb 16 12:01:43 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "We present an algorithm for synthesizing efficient
document layout engines from compact relational
specifications. These specifications are compact in
that a single specification can produce multiple
engines, each for a distinct layout situation, i.e., a
different combination of known vs. unknown attributes.
Technically, our specifications are relational
attribute grammars, while our engines are functional
attribute grammars. By synthesizing functions from
relational constraints, we obviate the need for
constraint solving at runtime, because functional
attribute grammars can be easily evaluated according to
a fixed schedule, sidestepping the backtracking search
performed by constraint solvers. Our experiments show
that we can generate layout engines for non-trivial
data visualizations, and that our synthesized engines
are between 39- and 200-times faster than
general-purpose constraint solvers. Relational
specifications of layout give rise to synthesis
problems that have previously proved intractable. Our
algorithm exploits the hierarchical, grammar-based
structure of the specification, decomposing the
specification into smaller subproblems, which can be
tackled with off-the-shelf synthesis procedures. The
new synthesis problem then becomes the composition of
the functions thus generated into a correct attribute
grammar, which might be recursive. We show how to solve
this problem by efficient reduction to an SMT
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "OOPSLA '15 conference proceedings.",
author = "Kai Huang and Min Yu and Rongjie Yan and Xiaomeng
Zhang and Xiaolang Yan and Lisane Brisolara and Ahmed
Amine Jerraya and Jiong Feng",
title = "Communication Optimizations for Multithreaded Code
Generation from {Simulink} Models",
journal = j-TECS,
volume = "14",
number = "3",
pages = "59:1--59:??",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2644811",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Sat Dec 9 08:08:56 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Communication frequency is increasing with the growing
complexity of emerging embedded applications and the
number of processors in the implemented multiprocessor
SoC architectures. In this article, we consider the
issue of communication cost reduction during
multithreaded code generation from partitioned Simulink
models to help designers in code optimization to
improve system performance. We first propose a
technique combining message aggregation and
communication pipeline methods, which groups
communications with the same destinations and sources
and parallelizes communication and computation tasks.
We also present a method to apply static analysis and
dynamic emulation for efficient communication buffer
allocation to further reduce synchronization cost and
increase processor utilization. The existing cyclic
dependency in the mapped model may hinder the
effectiveness of the two techniques. We further propose
a set of optimizations involving repartition with
strongly connected threads to maximize the degree of
communication reduction and preprocessing strategies
with available delays in the model to reduce the number
of communication channels that cannot be optimized.
Experimental results demonstrate the advantages of the
proposed optimizations with 11--143\% throughput
acknowledgement = ack-nhfb,
articleno = "59",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840",
author = "Ahmed Hussein and Antony L. Hosking and Mathias Payer
and Christopher A. Vick",
title = "Don't race the memory bus: taming the {GC} leadfoot",
journal = j-SIGPLAN,
volume = "50",
number = "11",
pages = "15--27",
month = nov,
year = "2015",
DOI = "https://doi.org/10.1145/2887746.2754182",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Feb 16 12:01:44 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Dynamic voltage and frequency scaling (DVFS) is
ubiquitous on mobile devices as a mechanism for saving
energy. Reducing the clock frequency of a processor
allows a corresponding reduction in power consumption,
as does turning off idle cores. Garbage collection is a
canonical example of the sort of memory-bound workload
that best responds to such scaling. Here, we explore
the impact of frequency scaling for garbage collection
in a real mobile device running Android's Dalvik
virtual machine, which uses a concurrent collector. By
controlling the frequency of the core on which the
concurrent collector thread runs we can reduce power
significantly. Running established multi-threaded
benchmarks shows that total processor energy can be
reduced up to 30\%, with end-to-end performance loss of
at most 10\%.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ISMM '15 conference proceedings.",
author = "Yongkweon Jeon and Sungroh Yoon",
title = "Multi-Threaded Hierarchical Clustering by Parallel
Nearest-Neighbor Chaining",
volume = "26",
number = "9",
pages = "2534--2548",
month = sep,
year = "2015",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Mon Sep 28 12:20:25 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
URL = "http://www.computer.org/csdl/trans/td/2015/09/06893001.pdf",
abstract-URL = "http://www.computer.org/csdl/trans/td/2015/09/06893001-abs.html",
acknowledgement = ack-nhfb,
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Mahmut Kandemir and Hui Zhao and Xulong Tang and
Mustafa Karakoy",
title = "Memory Row Reuse Distance and its Role in Optimizing
Application Performance",
journal = j-SIGMETRICS,
volume = "43",
number = "1",
pages = "137--149",
month = jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2796314.2745867",
ISSN = "0163-5999 (print), 1557-9484 (electronic)",
ISSN-L = "0163-5999",
bibdate = "Fri Sep 18 06:59:51 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Continuously increasing dataset sizes of large-scale
applications overwhelm on-chip cache capacities and
make the performance of last-level caches (LLC)
increasingly important. That is, in addition to
maximizing LLC hit rates, it is becoming equally
important to reduce LLC miss latencies. One of the
critical factors that influence LLC miss latencies is
row-buffer locality (i.e., the fraction of LLC misses
that hit in the large buffer attached to a memory
bank). While there has been a plethora of recent works
on optimizing row-buffer performance, to our knowledge,
there is no study that quantifies the full potential of
row-buffer locality and impact of maximizing it on
application performance. Focusing on multithreaded
applications, the first contribution of this paper is
the definition of a new metric called (memory) row
reuse distance (RRD). We show that, while intra-core
RRDs are relatively small (increasing the chances for
row-buffer hits), inter-core RRDs are quite large
(increasing the chances for row-buffer misses).
Motivated by this, we propose two schemes that measure
the maximum potential benefits that could be obtained
from minimizing RRDs, to the extent allowed by program
dependencies. Specifically, one of our schemes
(Scheme-I) targets only intra-core RRDs, whereas the
other one (Scheme-II) aims at reducing both intra-core
RRDs and inter-core RRDs. Our experimental evaluations
demonstrate that (i) Scheme-I reduces intra-core RRDs
but increases inter-core RRDs; (ii) Scheme-II reduces
inter-core RRDs significantly while achieving a similar
behavior to Scheme-I as far as intra-core RRDs are
concerned; (iii) Scheme-I and Scheme-II improve
execution times of our applications by 17\% and 21\%,
respectively, on average; and (iv) both our schemes
deliver consistently good results under different
memory request scheduling policies.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGMETRICS Performance Evaluation Review",
journal-URL = "http://portal.acm.org/toc.cfm?id=J618",
author = "Baris Kasikci and Cristian Zamfir and George Candea",
title = "Automated Classification of Data Races Under Both
Strong and Weak Memory Models",
journal = j-TOPLAS,
volume = "37",
number = "3",
pages = "8:1--8:??",
month = jun,
year = "2015",
DOI = "https://doi.org/10.1145/2734118",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Fri Jun 19 05:36:55 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
abstract = "Data races are one of the main causes of concurrency
problems in multithreaded programs. Whether all data
races are bad, or some are harmful and others are
harmless, is still the subject of vigorous scientific
debate [Narayanasamy et al. 2007; Boehm 2012]. What is
clear, however, is that today's code has many data
races [Kasikci et al. 2012; Jin et al. 2012; Erickson
et al. 2010], and fixing data races without introducing
bugs is time consuming [Godefroid and Nagappan 2008].
Therefore, it is important to efficiently identify data
races in code and understand their consequences to
prioritize their resolution. We present Portend$^+$, a
tool that not only detects races but also automatically
classifies them based on their potential consequences:
Could they lead to crashes or hangs? Could their
effects be visible outside the program? Do they appear
to be harmless? How do their effects change under weak
memory models? Our proposed technique achieves high
accuracy by efficiently analyzing multiple paths and
multiple thread schedules in combination, and by
performing symbolic comparison between program outputs.
We ran Portend$^+$ on seven real-world applications: it
detected 93 true data races and correctly classified 92
of them, with no human effort. Six of them were harmful
races. Portend$^+$ 's classification accuracy is up to
89\% higher than that of existing tools, and it
produces easy-to-understand evidence of the
consequences of ``harmful'' races, thus both proving
their harmfulness and making debugging easier. We
envision Portend$^+$ being used for testing and
debugging, as well as for automatically triaging bug
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Programming Languages and
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Steve Kerrison and Kerstin Eder",
title = "Energy Modeling of Software for a Hardware
Multithreaded Embedded Microprocessor",
journal = j-TECS,
volume = "14",
number = "3",
pages = "56:1--56:??",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700104",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Sat Dec 9 08:08:56 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "This article examines a hardware multithreaded
microprocessor and discusses the impact such an
architecture has on existing software energy modeling
techniques. A framework is constructed for analyzing
the energy behavior of the XMOS XS1-L multithreaded
processor and a variation on existing software energy
models is proposed, based on analysis of collected
energy data. It is shown that by combining execution
statistics with sufficient data on the processor's
thread activity and instruction execution costs, a
multithreaded software energy model used with
Instruction Set Simulation can yield an average error
margin of less than 7\%.",
acknowledgement = ack-nhfb,
articleno = "56",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840",
author = "Gokcen Kestor and Osman S. Unsal and Adrian Cristal
and Serdar Tasiran",
title = "{TRADE}: Precise Dynamic Race Detection for Scalable
Transactional Memory Systems",
journal = j-TOPC,
volume = "2",
number = "2",
pages = "11:1--11:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2786021",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Aug 7 10:22:35 MDT 2015",
bibsource = "http://topc.acm.org/;
abstract = "As other multithreaded programs, transactional memory
(TM) programs are prone to race conditions. Previous
work focuses on extending existing definitions of data
race for lock-based applications to TM applications,
which requires all transactions to be totally ordered
``as if'' serialized by a global lock. This approach
poses implementation constraints on the STM that
severely limits TM applications' performance. This
article shows that forcing total ordering among all
running transactions, while sufficient, is not
necessary. We introduce an alternative data race
definition, relaxed transactional data race, that
requires ordering of only conflicting transactions. The
advantages of our relaxed definition are twofold:
First, unlike the previous definition, this definition
can be applied to a wide range of TMs, including those
that do not enforce transaction total ordering. Second,
within a single execution, it exposes a higher number
of data races, which considerably reduces debugging
time. Based on this definition, we propose a novel and
precise race detection tool for C/C++ TM applications
(TRADE), which detects data races by tracking
happens-before edges among conflicting transactions.
Our experiments reveal that TRADE precisely detects
data races for STAMP applications running on modern
STMs with overhead comparable to state-of-the-art race
detectors for lock-based applications. Our experiments
also show that in a single run, TRADE identifies
several races not discovered by 10 separate runs of a
race detection tool based on the previous data race
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
author = "Onur Kocberber and Babak Falsafi and Boris Grot",
title = "Asynchronous memory access chaining",
volume = "9",
number = "4",
pages = "252--263",
month = dec,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
bibdate = "Sat Dec 19 17:42:25 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
abstract = "In-memory databases rely on pointer-intensive data
structures to quickly locate data in memory. A single
lookup operation in such data structures often exhibits
long-latency memory stalls due to dependent pointer
dereferences. Hiding the memory latency by launching
additional memory accesses for other lookups is an
effective way of improving performance of
pointer-chasing codes (e.g., hash table probes, tree
traversals). The ability to exploit such inter-lookup
parallelism is beyond the reach of modern out-of-order
cores due to the limited size of their instruction
window. Instead, recent work has proposed software
prefetching techniques that exploit inter-lookup
parallelism by arranging a set of independent lookups
into a group or a pipeline, and navigate their
respective pointer chains in a synchronized fashion.
While these techniques work well for highly regular
access patterns, they break down in the face of
irregularity across lookups. Such irregularity includes
variable-length pointer chains, early exit, and
read/write dependencies. This work introduces
Asynchronous Memory Access Chaining (AMAC), a new
approach for exploiting inter-lookup parallelism to
hide the memory access latency. AMAC achieves high
dynamism in dealing with irregularity across lookups by
maintaining the state of each lookup separately from
that of other lookups. This feature enables AMAC to
initiate a new lookup as soon as any of the in-flight
lookups complete. In contrast, the static arrangement
of lookups into a group or pipeline in existing
techniques precludes such adaptivity. Our results show
that AMAC matches or outperforms state-of-the-art
prefetching techniques on regular access patterns,
while delivering up to 2.3x higher performance under
irregular data structure lookups. AMAC fully utilizes
the available microarchitectural resources, generating
the maximum number of memory accesses allowed by
hardware in both single- and multi-threaded execution
acknowledgement = ack-nhfb,
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
author = "Bartlomiej Jacek Kubica",
title = "Presentation of a highly tuned multithreaded interval
solver for underdetermined and well-determined
nonlinear systems",
volume = "70",
number = "4",
pages = "929--963",
month = dec,
year = "2015",
DOI = "https://doi.org/10.1007/s11075-015-9980-y",
ISSN = "1017-1398 (print), 1572-9265 (electronic)",
ISSN-L = "1017-1398",
bibdate = "Mon Jan 25 08:55:03 MST 2016",
bibsource = "http://link.springer.com/journal/11075/70/4;
URL = "http://link.springer.com/article/10.1007/s11075-015-9980-y;
acknowledgement = ack-nhfb,
fjournal = "Numerical Algorithms",
journal-URL = "http://link.springer.com/journal/11075",
author = "Bradley C. Kuszmaul",
title = "{SuperMalloc}: a super fast multithreaded {\tt malloc}
for 64-bit machines",
journal = j-SIGPLAN,
volume = "50",
number = "11",
pages = "41--55",
month = nov,
year = "2015",
DOI = "https://doi.org/10.1145/2887746.2754178",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Feb 16 12:01:44 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "SuperMalloc is an implementation of malloc(3)
originally designed for X86 Hardware Transactional
Memory (HTM)@. It turns out that the same design
decisions also make it fast even without HTM@. For the
malloc-test benchmark, which is one of the most
difficult workloads for an allocator, with one thread
SuperMalloc is about 2.1 times faster than the best of
DLmalloc, JEmalloc, Hoard, and TBBmalloc; with 8
threads and HTM, SuperMalloc is 2.75 times faster; and
on 32 threads without HTM SuperMalloc is 3.4 times
faster. SuperMalloc generally compares favorably with
the other allocators on speed, scalability, speed
variance, memory footprint, and code size. SuperMalloc
achieves these performance advantages using less than
half as much code as the alternatives. SuperMalloc
exploits the fact that although physical memory is
always precious, virtual address space on a 64-bit
machine is relatively cheap. It allocates 2 chunks
which contain objects all the same size. To translate
chunk numbers to chunk metadata, SuperMalloc uses a
simple array (most of which is uncommitted to physical
memory). SuperMalloc takes care to avoid associativity
conflicts in the cache: most of the size classes are a
prime number of cache lines, and nonaligned huge
accesses are randomly aligned within a page. Objects
are allocated from the fullest non-full page in the
appropriate size class. For each size class,
SuperMalloc employs a 10-object per-thread cache, a
per-CPU cache that holds about a level-2-cache worth of
objects per size class, and a global cache that is
organized to allow the movement of many objects between
a per-CPU cache and the global cache using $ O(1) $
instructions. SuperMalloc prefetches everything it can
before starting a critical section, which makes the
critical sections run fast, and for HTM improves the
odds that the transaction will commit.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ISMM '15 conference proceedings.",
author = "Bo-Cheng Charles Lai and Kun-Chun Li and Guan-Ru Li
and Chin-Hsuan Chiang",
title = "Self adaptable multithreaded object detection on
embedded multicore systems",
journal = j-J-PAR-DIST-COMP,
volume = "78",
number = "??",
pages = "25--38",
month = apr,
year = "2015",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Sat Mar 21 09:26:08 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0743731515000192",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315/",
author = "Akash Lal and Shaz Qadeer",
title = "{DAG} inlining: a decision procedure for
reachability-modulo-theories in hierarchical programs",
journal = j-SIGPLAN,
volume = "50",
number = "6",
pages = "280--290",
month = jun,
year = "2015",
DOI = "https://doi.org/10.1145/2813885.2737987",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Feb 16 12:01:41 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "A hierarchical program is one with multiple procedures
but no loops or recursion. This paper studies the
problem of deciding reachability queries in
hierarchical programs where individual statements can
be encoded in a decidable logic (say in SMT). This
problem is fundamental to verification and most
directly applicable to doing bounded reachability in
programs, i.e., reachability under a bound on the
number of loop iterations and recursive calls. The
usual method of deciding reachability in hierarchical
programs is to first inline all procedures and then do
reachability on the resulting single-procedure program.
Such inlining unfolds the call graph of the program to
a tree and may lead to an exponential increase in the
size of the program. We design and evaluate a method
called DAG inlining that unfolds the call graph to a
directed acyclic graph (DAG) instead of a tree by
sharing the bodies of procedures at certain points
during inlining. DAG inlining can produce much more
compact representations than tree inlining.
Empirically, we show that it leads to significant
improvements in the running time of a state-of-the-art
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '15 conference proceedings.",
author = "Dominique LaSalle and George Karypis",
title = "Multi-threaded modularity based graph clustering using
the multilevel paradigm",
journal = j-J-PAR-DIST-COMP,
volume = "76",
number = "??",
pages = "66--80",
month = feb,
year = "2015",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Mon Mar 9 10:30:03 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0743731514001750",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315/",
author = "Ahmad Lashgar and Ebad Salehi and Amirali Baniasadi",
title = "A Case Study in Reverse Engineering {GPGPUs}:
Outstanding Memory Handling Resources",
journal = j-COMP-ARCH-NEWS,
volume = "43",
number = "4",
pages = "15--21",
month = sep,
year = "2015",
DOI = "https://doi.org/10.1145/2927964.2927968",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri Apr 22 17:03:53 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "During recent years, GPU micro-architectures have
changed dramatically, evolving into powerful many-core
deep-multithreaded platforms for parallel workloads.
While important micro-architectural modifications
continue to appear in every new generation of these
processors, unfortunately, little is known about the
details of these innovative designs. One of the key
questions in understanding GPUs is how they deal with
outstanding memory misses. Our goal in this study is to
find answers to this question. To this end, we develop
a set of micro-benchmarks in CUDA to understand the
outstanding memory requests handling resources.
Particularly, we study two NVIDIA GPGPUs (Fermi and
Kepler) and estimate their capability in handling
outstanding memory requests. We show that Kepler can
issue nearly 32X higher number of outstanding memory
requests, compared to Fermi. We explain this
enhancement by Kepler's architectural modifications in
outstanding memory request handling resources.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
remark = "HEART '15 conference proceedings.",
author = "Peng Liu and Xiangyu Zhang and Omer Tripp and Yunhui
title = "{Light}: replay via tightly bounded recording",
journal = j-SIGPLAN,
volume = "50",
number = "6",
pages = "55--64",
month = jun,
year = "2015",
DOI = "https://doi.org/10.1145/2813885.2738001",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Feb 16 12:01:41 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Reproducing concurrency bugs is a prominent challenge.
Existing techniques either rely on recording very fine
grained execution information and hence have high
runtime overhead, or strive to log as little
information as possible but provide no guarantee in
reproducing a bug. We present Light, a technique that
features much lower overhead compared to techniques
based on fine grained recording, and that guarantees to
reproduce concurrent bugs. We leverage and formally
prove that recording flow dependences is the necessary
and sufficient condition to reproduce a concurrent bug.
The flow dependences, together with the thread local
orders that can be automatically inferred (and hence
not logged), are encoded as scheduling constraints. An
SMT solver is used to derive a replay schedule, which
is guaranteed to exist even though it may be different
from the original schedule. Our experiments show that
Light has only 44\% logging overhead, almost one order
of magnitude lower than the state of the art techniques
relying on logging memory accesses. Its space overhead
is only 10\% of those techniques. Light can also
reproduce all the bugs we have collected whereas
existing techniques miss some of them.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '15 conference proceedings.",
author = "Nuno Machado and Brandon Lucia and Lu{\'\i}s
Rodrigues",
title = "Concurrency debugging with differential schedule
projections",
journal = j-SIGPLAN,
volume = "50",
number = "6",
pages = "586--595",
month = jun,
year = "2015",
DOI = "https://doi.org/10.1145/2813885.2737973",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Feb 16 12:01:41 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "We present Symbiosis: a concurrency debugging
technique based on novel differential schedule
projections (DSPs). A DSP shows the small set of memory
operations and data-flows responsible for a failure, as
well as a reordering of those elements that avoids the
failure. To build a DSP, Symbiosis first generates a
full, failing, multithreaded schedule via thread path
profiling and symbolic constraint solving. Symbiosis
selectively reorders events in the failing schedule to
produce a non-failing, alternate schedule. A DSP
reports the ordering and data-flow differences between
the failing and non-failing schedules. Our evaluation
on buggy real-world software and benchmarks shows that,
in practical time, Symbiosis generates DSPs that both
isolate the small fraction of event orders and
data-flows responsible for the failure, and show which
event reorderings prevent failing. In our experiments,
DSPs contain 81\% fewer events and 96\% less data-flows
than the full failure-inducing schedules. Moreover, by
allowing developers to focus on only a few events, DSPs
reduce the amount of time required to find a valid
fix.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '15 conference proceedings.",
author = "Darko Makreshanski and Justin Levandoski and Ryan
Stutsman",
title = "To lock, swap, or elide: on the interplay of hardware
transactional memory and lock-free indexing",
volume = "8",
number = "11",
pages = "1298--1309",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809990",
ISSN = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "The release of hardware transactional memory (HTM) in
commodity CPUs has major implications on the design and
implementation of main-memory databases, especially on
the architecture of high-performance lock-free indexing
methods at the core of several of these systems. This
paper studies the interplay of HTM and lock-free
indexing methods. First, we evaluate whether HTM will
obviate the need for crafty lock-free index designs by
integrating it in a traditional B-tree architecture.
HTM performs well for simple data sets with small
fixed-length keys and payloads, but its benefits
disappear for more complex scenarios (e.g., larger
variable-length keys and payloads), making it
unattractive as a general solution for achieving high
performance. Second, we explore fundamental differences
between HTM-based and lock-free B-tree designs. While
lock-freedom entails design complexity and extra
mechanism, it has performance advantages in several
scenarios, especially high-contention cases where
readers proceed uncontested (whereas HTM aborts
readers). Finally, we explore the use of HTM as a
method to simplify lock-free design. We find that using
HTM to implement a multi-word compare-and-swap greatly
reduces lock-free programming complexity at the cost of
only a 10-15\% performance degradation. Our study uses
two state-of-the-art index implementations: a
memory-optimized B-tree extended with HTM to provide
multi-threaded concurrency and the Bw-tree lock-free
B-tree used in several Microsoft production
systems.",
acknowledgement = ack-nhfb,
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
author = "Nikola Markovic and Daniel Nemirovsky and Osman Unsal
and Mateo Valero and Adrian Cristal",
title = "Thread Lock Section-Aware Scheduling on Asymmetric
Single-{ISA} Multi-Core",
volume = "14",
number = "2",
pages = "160--163",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2357805",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "As thread level parallelism in applications has
continued to expand, so has research in chip multi-core
processors. As more and more applications become
multi-threaded we expect to find a growing number of
threads executing on a machine. As a consequence, the
operating system will require increasingly larger
amounts of CPU time to schedule these threads
efficiently. Instead of perpetuating the trend of
performing more complex thread scheduling in the
operating system, we propose a scheduling mechanism
that can be efficiently implemented in hardware as
well. Our approach of identifying multi-threaded
application bottlenecks such as thread synchronization
sections complements the Fairness-aware Scheduler
method. It achieves an average speed up of 11.5 percent
(geometric mean) compared to the state-of-the-art
Fairness-aware Scheduler.",
acknowledgement = ack-nhfb,
affiliation = "Markovic, N (Reprint Author), Barcelona Supercomputing
Ctr, Barcelona, Spain. Markovic, Nikola; Nemirovsky,
Daniel; Unsal, Osman; Valero, Mateo, Barcelona
Supercomputing Ctr, Barcelona, Spain. Markovic, Nikola;
Nemirovsky, Daniel; Valero, Mateo, Univ Politecn
Cataluna, Barcelona, Spain. Cristal, Adrian, Univ
Politecn Cataluna, Barcelona Supercomputing Ctr,
E-08028 Barcelona, Spain. Cristal, Adrian, Artificial
Intelligence Res Inst Spanish Natl Res, Barcelona,
author-email = "nikola.markovic@bsc.es daniel.nemirovsky@bsc.es
osman.unsal@bsc.es mateo.valero@bsc.es
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Asymmetric chip multiprocessor (ACMP); HW/SW thread
scheduling; multi-threaded applications",
number-of-cited-references = "17",
ORCID-numbers = "UNSAL, OSMAN/0000-0002-0544-9697 Valero,
research-areas = "Computer Science",
researcherid-numbers = "UNSAL, OSMAN/B-9161-2016 Valero,
times-cited = "7",
unique-id = "Markovic:2015:TLS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
author = "George Matheou and Paraskevas Evripidou",
title = "Architectural Support for Data-Driven Execution",
journal = j-TACO,
volume = "11",
number = "4",
pages = "52:1--52:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2686874",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "The exponential growth of sequential processors has
come to an end, and thus, parallel processing is
probably the only way to achieve performance growth. We
propose the development of parallel architectures based
on data-driven scheduling. Data-driven scheduling
enforces only a partial ordering as dictated by the
true data dependencies, which is the minimum
synchronization possible. This is very beneficial for
parallel processing because it enables it to exploit
the maximum possible parallelism. We provide
architectural support for data-driven execution for the
Data-Driven Multithreading (DDM) model. In the past,
DDM has been evaluated mostly in the form of virtual
machines. The main contribution of this work is the
development of a highly efficient hardware support for
data-driven execution and its integration into a
multicore system with eight cores on a Virtex-6 FPGA.
The DDM semantics make barriers and cache coherence
unnecessary, which reduces the synchronization
latencies significantly and makes the cache simpler.
The performance evaluation has shown that the support
for data-driven execution is very efficient with
negligible overheads. Our prototype can support very
small problem sizes (matrix $ 16 \times 16$) and
ultra-lightweight threads (block of $ 4 \times 4$) that
achieve speedups close to linear. Such results cannot
be achieved by software-based systems.",
acknowledgement = ack-nhfb,
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "W. P. McCartney and N. Sridhar",
title = "Stackless Multi-Threading for Embedded Systems",
journal = j-IEEE-TRANS-COMPUT,
volume = "64",
number = "10",
pages = "2940--2952",
month = "????",
year = "2015",
DOI = "https://doi.org/10.1109/TC.2014.2378256",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Tue Oct 13 06:51:51 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Kshitij Mehta and Edgar Gabriel",
title = "Multi-Threaded Parallel {I/O} for {OpenMP}
Applications",
journal = j-INT-J-PARALLEL-PROG,
volume = "43",
number = "2",
pages = "286--309",
month = apr,
year = "2015",
DOI = "https://doi.org/10.1007/s10766-014-0306-9",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Sat Aug 8 12:34:16 MDT 2015",
bibsource = "http://link.springer.com/journal/10766/43/2;
URL = "http://link.springer.com/article/10.1007/s10766-014-0306-9",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
author = "Nathan Mitchell and Court Cutting and Eftychios
Sifakis",
title = "{GRIDiron}: an interactive authoring and cognitive
training foundation for reconstructive plastic surgery
procedures",
journal = j-TOG,
volume = "34",
number = "4",
pages = "43:1--43:??",
month = aug,
year = "2015",
DOI = "https://doi.org/10.1145/2766918",
ISSN = "0730-0301 (print), 1557-7368 (electronic)",
ISSN-L = "0730-0301",
bibdate = "Tue Jul 28 17:22:44 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tog/;
abstract = "We present an interactive simulation framework for
authoring surgical procedures of soft tissue
manipulation using physics-based simulation to animate
the flesh. This interactive authoring tool can be used
by clinical educators to craft three-dimensional
illustrations of the intricate maneuvers involved in
craniofacial repairs, in contrast to two-dimensional
sketches and still photographs which are the medium
used to describe these procedures in the traditional
surgical curriculum. Our virtual environment also
allows surgeons-in-training to develop cognitive skills
for craniofacial surgery by experimenting with
different approaches to reconstructive challenges,
adapting stock techniques to flesh regions with
nonstandard shape, and reach preliminary predictions
about the feasibility of a given repair plan. We use a
Cartesian grid-based embedded discretization of
nonlinear elasticity to maximize regularity, and expose
opportunities for aggressive multithreading and SIMD
accelerations. Using a grid-based approach facilitates
performance and scalability, but constrains our ability
to capture the topology of thin surgical incisions. We
circumvent this restriction by hybridizing the
grid-based discretization with an explicit hexahedral
mesh representation in regions where the embedding mesh
necessitates overlap or nonmanifold connectivity.
Finally, we detail how the front-end of our system can
run on lightweight clients, while the core simulation
capability can be hosted on a dedicated server and
delivered as a network service.",
acknowledgement = ack-nhfb,
articleno = "43",
fjournal = "ACM Transactions on Graphics",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J778",
author = "Thomas Nelson and Geoffrey Belter and Jeremy G. Siek
and Elizabeth Jessup and Boyana Norris",
title = "Reliable Generation of High-Performance Matrix
Algebra",
journal = j-TOMS,
volume = "41",
number = "3",
pages = "18:1--18:27",
month = jun,
year = "2015",
DOI = "https://doi.org/10.1145/2629698",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Wed Jun 3 17:59:32 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Scientific programmers often turn to vendor-tuned
Basic Linear Algebra Subprograms (BLAS) to obtain
portable high performance. However, many numerical
algorithms require several BLAS calls in sequence, and
those successive calls do not achieve optimal
performance. The entire sequence needs to be optimized
in concert. Instead of vendor-tuned BLAS, a programmer
could start with source code in Fortran or C (e.g.,
based on the Netlib BLAS) and use a state-of-the-art
optimizing compiler. However, our experiments show that
optimizing compilers often attain only one-quarter of
the performance of hand-optimized code. In this
article, we present a domain-specific compiler for
matrix kernels, the Build to Order BLAS (BTO), that
reliably achieves high performance using a scalable
search algorithm for choosing the best combination of
loop fusion, array contraction, and multithreading for
data parallelism. The BTO compiler generates code that
is between 16\% slower and 39\% faster than
hand-optimized code.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
author = "Ph{\'u}c C. Nguy{\v{e}}n and David {Van Horn}",
title = "Relatively complete counterexamples for higher-order
programs",
journal = j-SIGPLAN,
volume = "50",
number = "6",
pages = "446--456",
month = jun,
year = "2015",
DOI = "https://doi.org/10.1145/2813885.2737971",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Feb 16 12:01:41 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "In this paper, we study the problem of generating
inputs to a higher-order program causing it to error.
We first approach the problem in the setting of PCF, a
typed, core functional language and contribute the
first relatively complete method for constructing
counterexamples for PCF programs. The method is
relatively complete with respect to a first-order
solver over the base types of PCF. In practice, this
means an SMT solver can be used for the effective,
automated generation of higher-order counterexamples
for a large class of programs. We achieve this result
by employing a novel form of symbolic execution for
higher-order programs. The remarkable aspect of this
symbolic execution is that even though symbolic
higher-order inputs and values are considered, the path
condition remains a first-order formula. Our handling
of symbolic function application enables the
reconstruction of higher-order counterexamples from
this first-order formula. After establishing our main
theoretical results, we sketch how to apply the
approach to untyped, higher-order, stateful languages
with first-class contracts and show how counterexample
generation can be used to detect contract violations in
this setting. To validate our approach, we implement a
tool generating counterexamples for erroneous modules
written in Racket.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '15 conference proceedings.",
author = "Jared Pager and Reiley Jeyapaul and Aviral
Shrivastava",
title = "A Software Scheme for Multithreading on {CGRAs}",
journal = j-TECS,
volume = "14",
number = "1",
pages = "19:1--19:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2638558",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Jan 22 06:25:23 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Recent industry trends show a drastic rise in the use
of hand-held embedded devices, from everyday
applications to medical (e.g., monitoring devices) and
critical defense applications (e.g., sensor nodes). The
two key requirements in the design of such devices are
their processing capabilities and battery life. There
is therefore an urgency to build high-performance and
power-efficient embedded devices, inspiring researchers
to develop novel system designs for the same. The use
of a coprocessor (application-specific hardware) to
offload power-hungry computations is gaining favor
among system designers to suit their power budgets. We
propose the use of CGRAs (Coarse-Grained Reconfigurable
Arrays) as a power-efficient coprocessor. Though CGRAs
have been widely used for streaming applications, the
extensive compiler support required limits its
applicability and use as a general purpose coprocessor.
In addition, a CGRA structure can efficiently execute
only one statically scheduled kernel at a time, which
is a serious limitation when used as an accelerator to
a multithreaded or multitasking processor. In this
work, we envision a multithreaded CGRA where multiple
schedules (or kernels) can be executed simultaneously
on the CGRA (as a coprocessor). We propose a
comprehensive software scheme that transforms the
traditionally single-threaded CGRA into a multithreaded
coprocessor to be used as a power-efficient accelerator
for multithreaded embedded processors. Our software
scheme includes (1) a compiler framework that
integrates with existing CGRA mapping techniques to
prepare kernels for execution on the multithreaded CGRA
and (2) a runtime mechanism that dynamically schedules
multiple kernels (offloaded from the processor) to
execute simultaneously on the CGRA coprocessor. Our
multithreaded CGRA coprocessor implementation thus
makes it possible to achieve improved power-efficient
computing in modern multithreaded embedded systems.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840",
author = "J. F. P{\'e}rez and G. Casale and S. Pacheco-Sanchez",
title = "Estimating Computational Requirements in
Multi-Threaded Applications",
volume = "41",
number = "3",
pages = "264--278",
month = mar,
year = "2015",
DOI = "https://doi.org/10.1109/TSE.2014.2363472",
ISSN = "0098-5589 (print), 1939-3520 (electronic)",
ISSN-L = "0098-5589",
bibdate = "Thu Feb 1 19:49:24 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib;
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6926798",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Software Engineering",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
author = "Leo Porter and Michael A. Laurenzano and Ananta Tiwari
and Adam Jundt and William A. {Ward, Jr.} and Roy
Campbell and Laura Carrington",
title = "Making the Most of {SMT} in {HPC}: System- and
Application-Level Perspectives",
journal = j-TACO,
volume = "11",
number = "4",
pages = "59:1--59:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687651",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "This work presents an end-to-end methodology for
quantifying the performance and power benefits of
simultaneous multithreading (SMT) for HPC centers and
applies this methodology to a production system and
workload. Ultimately, SMT's value system-wide depends
on whether users effectively employ SMT at the
application level. However, predicting SMT's benefit
for HPC applications is challenging; by doubling the
number of threads, the application's characteristics
may change. This work proposes statistical modeling
techniques to predict the speedup SMT confers to HPC
applications. This approach, accurate to within 8\%,
uses only lightweight, transparent performance monitors
collected during a single run of the application.",
acknowledgement = ack-nhfb,
articleno = "59",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Donald E. Porter and Michael D. Bond and Indrajit Roy
and Kathryn S. Mckinley and Emmett Witchel",
title = "Practical Fine-Grained Information Flow Control Using
{Laminar}",
journal = j-TOPLAS,
volume = "37",
number = "1",
pages = "4:1--4:??",
month = jan,
year = "2015",
DOI = "https://doi.org/10.1145/2638548",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Wed Jan 21 07:13:17 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
abstract = "Decentralized Information Flow Control (DIFC) is a
promising model for writing programs with powerful,
end-to-end security guarantees. Current DIFC systems
that run on commodity hardware can be broadly
categorized into two types: language-level and
operating system-level DIFC. Language solutions provide
no guarantees against security violations on system
resources such as files and sockets. Operating system
solutions mediate accesses to system resources but are
either inefficient or imprecise at monitoring the flow
of information through fine-grained program data
structures. This article describes Laminar, the first
system to implement DIFC using a unified set of
abstractions for OS resources and heap-allocated
objects. Programmers express security policies by
labeling data with secrecy and integrity labels and
access the labeled data in security methods. Laminar
enforces the security policies specified by the labels
at runtime. Laminar is implemented using a modified
Java virtual machine and a new Linux security module.
This article shows that security methods ease
incremental deployment and limit dynamic security
checks by retrofitting DIFC policies on four
application case studies. Replacing the applications'
ad hoc security policies changes less than 10\% of the
code and incurs performance overheads from 5\% to 56\%.
Compared to prior DIFC systems, Laminar supports a more
general class of multithreaded DIFC programs
efficiently and integrates language and OS
abstractions.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Programming Languages and
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Rance Rodrigues and Israel Koren and Sandip Kundu",
title = "Does the Sharing of Execution Units Improve
Performance\slash Power of Multicores?",
journal = j-TECS,
volume = "14",
number = "1",
pages = "17:1--17:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2680543",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Jan 22 06:25:23 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Several studies and recent real-world designs have
promoted sharing of underutilized resources between
cores in a multicore processor to achieve better
performance/power. It has been argued that when
utilization of such resources is low, sharing has a
negligible impact on performance while offering
considerable area and power benefits. In this article,
we investigate the performance and performance/watt
implications of sharing large and underutilized
resources between pairs of cores in a multicore. We
first study sharing of the entire floating-point
datapath (including reservation stations and execution
units) by two cores, similar to AMD's Bulldozer. We
find that while this architecture results in power
savings for certain workload combinations, it also
results in significant performance loss of up to 28\%.
Next, we study an alternative sharing architecture
where only the floating-point execution units are
shared, while the individual cores retain their
reservation stations. This reduces the highest
performance loss to 14\%. We then extend the study to
include sharing of other large execution units that are
used infrequently, namely, the integer multiply and
divide units. Subsequently, we analyze the impact of
sharing hardware resources in Simultaneously
Multithreaded (SMT) processors where multiple threads
run concurrently on the same core. We observe that
sharing improves performance/watt at a negligible
performance cost only if the shared units have high
throughput. Sharing low-throughput units reduces both
performance and performance/watt. To increase the
throughput of the shared units, we propose the use of
Dynamic Voltage and Frequency Boosting (DVFB) of only
the shared units that can be placed on a separate
voltage island. Our results indicate that the use of
DVFB improves both performance and performance/watt by
as much as 22\% and 10\%, respectively.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840",
author = "Emmanuelle Saillard and Patrick Carribault and Denis
Barthou",
title = "Static\slash dynamic validation of {MPI} collective
communications in multi-threaded context",
journal = j-SIGPLAN,
volume = "50",
number = "8",
pages = "279--280",
month = aug,
year = "2015",
DOI = "https://doi.org/10.1145/2858788.2688548",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Feb 16 12:01:42 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Scientific applications mainly rely on the MPI
parallel programming model to reach high performance on
supercomputers. The advent of manycore architectures
(larger number of cores and lower amount of memory per
core) leads to mix MPI with a thread-based model like
OpenMP. But integrating two different programming
models inside the same application can be tricky and
generate complex bugs. Thus, the correctness of hybrid
programs requires a special care regarding MPI calls
location. For example, identical MPI collective
operations cannot be performed by multiple
non-synchronized threads. To tackle this issue, this
paper proposes a static analysis and a reduced dynamic
instrumentation to detect bugs related to misuse of MPI
collective operations inside or outside threaded
regions. This work extends PARCOACH designed for
MPI-only applications and keeps the compatibility with
these algorithms. We validated our method on multiple
hybrid benchmarks and applications with a low
overhead.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '15 conference proceedings.",
author = "Malavika Samak and Murali Krishna Ramanathan and
Suresh Jagannathan",
title = "Synthesizing racy tests",
journal = j-SIGPLAN,
volume = "50",
number = "6",
pages = "175--185",
month = jun,
year = "2015",
DOI = "https://doi.org/10.1145/2813885.2737998",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Tue Feb 16 12:01:41 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
abstract = "Subtle concurrency errors in multithreaded libraries
that arise because of incorrect or inadequate
synchronization are often difficult to pinpoint
precisely using only static techniques. On the other
hand, the effectiveness of dynamic race detectors is
critically dependent on multithreaded test suites whose
execution can be used to identify and trigger races.
Usually, such multithreaded tests need to invoke a
specific combination of methods with objects involved
in the invocations being shared appropriately to expose
a race. Without a priori knowledge of the race,
construction of such tests can be challenging. In this
paper, we present a lightweight and scalable technique
for synthesizing precisely these kinds of tests. Given
a multithreaded library and a sequential test suite, we
describe a fully automated analysis that examines
sequential execution traces, and produces as its output
a concurrent client program that drives shared objects
via library method calls to states conducive for
triggering a race. Experimental results on a variety of
well-tested Java libraries yield 101 synthesized
multithreaded tests in less than four minutes.
Analyzing the execution of these tests using an
off-the-shelf race detector reveals 187 harmful races,
including several previously unreported ones. Our
implementation, named NARADA, and the results of our
experiments are available at
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '15 conference proceedings.",
author = "P. Schweitzer and S. Cipi{\`e}re and A. Dufaure and H.
Payno and Y. Perrot and D. R. C. Hill and L. Maigne",
title = "Performance Evaluation of Multithreaded {Geant4}
Simulations Using an {Intel Xeon Phi} Cluster",
journal = j-SCI-PROG,
volume = "2015",
number = "??",
pages = "980752:1--980752:10",
month = "????",
year = "2015",
DOI = "https://doi.org/10.1155/2015/980752",
ISSN = "1058-9244 (print), 1875-919X (electronic)",
ISSN-L = "1058-9244",
bibdate = "Tue Sep 20 07:53:44 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://www.hindawi.com/journals/sp/2015/980752/",
acknowledgement = ack-nhfb,
fjournal = "Scientific Programming",
journal-URL = "https://www.hindawi.com/journals/sp/",
journalabr = "Sci. Prog",
author = "Qingchuan Shi and Henry Hoffmann and Omer Khan",
title = "A Cross-Layer Multicore Architecture to Tradeoff
Program Accuracy and Resilience Overheads",
volume = "14",
number = "2",
pages = "85--89",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2365204",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "To protect multicores from soft-error perturbations,
resiliency schemes have been developed with high
coverage but high power/performance overheads (similar
to 2x). We observe that not all soft-errors affect
program correctness, some soft-errors only affect
program accuracy, i.e., the program completes with
certain acceptable deviations from soft-error free
outcome. Thus, it is practical to improve processor
efficiency by trading off resilience overheads with
program accuracy. We propose the idea of declarative
resilience that selectively applies resilience schemes
to both crucial and non-crucial code, while ensuring
program correctness. At the application level, crucial
and non-crucial code is identified based on its impact
on the program outcome. The hardware collaborates with
software support to enable efficient resilience with
100 percent soft-error coverage. Only program accuracy
is compromised in the worst-case scenario of a
soft-error strike during non-crucial code execution.
For a set of multithreaded benchmarks, declarative
resilience improves completion time by an average of 21
percent over state-of-the-art hardware resilience
scheme that protects all executed code. Its performance
overhead is similar to 1.38x over a multicore that does
not support resilience.",
acknowledgement = ack-nhfb,
affiliation = "Shi, QC (Reprint Author), Univ Connecticut, Dept Elect
\& Comp Engn, Storrs, CT 06269 USA. Shi, Qingchuan;
Khan, Omer, Univ Connecticut, Dept Elect \& Comp Engn,
Storrs, CT 06269 USA. Hoffmann, Henry, Univ Chicago,
Dept Comp Sci, Chicago, IL 60637 USA.",
author-email = "qingchuan.shi@uconn.edu hankhoffmann@cs.uchicago.edu
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "multicores; program accuracy; Resilience;
number-of-cited-references = "23",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Shi:2015:CLM",
web-of-science-categories = "Computer Science, Hardware \&
author = "Youjip Won and Kyeongyeol Lim and Jaehong Min",
title = "{MUCH}: Multithreaded Content-Based File Chunking",
journal = j-IEEE-TRANS-COMPUT,
volume = "64",
number = "5",
pages = "1375--1388",
month = may,
year = "2015",
DOI = "https://doi.org/10.1109/TC.2014.2322600",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Thu Jun 4 19:46:44 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Junchao Zhang and Babak Behzad and Marc Snir",
title = "Design of a Multithreaded {Barnes--Hut} Algorithm for
Multicore Clusters",
volume = "26",
number = "7",
pages = "1861--1873",
month = jul,
year = "2015",
DOI = "https://doi.org/10.1109/TPDS.2014.2331243",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Mon Aug 3 11:58:51 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/bibnet/subjects/fastmultipole.bib;
URL = "http://www.computer.org/csdl/trans/td/2015/07/06837521-abs.html",
abstract-URL = "http://www.computer.org/csdl/trans/td/2015/07/06837521-abs.html",
acknowledgement = ack-nhfb,
journal-URL = "http://www.computer.org/tpds/archives.htm",
author = "Naling Zhang and Markus Kusano and Chao Wang",
title = "Dynamic partial order reduction for relaxed memory
journal = j-SIGPLAN,
volume = "50",
number = "6",
pages = "250--259",
month = jun,
year = "2015",
DOI = "https://doi.org/10.1145/2813885.2737956",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Feb 16 12:01:41 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Under a relaxed memory model such as TSO or PSO, a
concurrent program running on a shared-memory
multiprocessor may observe two types of nondeterminism:
the nondeterminism in thread scheduling and the
nondeterminism in store buffering. Although there is a
large body of work on mitigating the scheduling
nondeterminism during runtime verification, methods for
soundly mitigating the store buffering nondeterminism
are lacking. We propose a new dynamic partial order
reduction (POR) algorithm for verifying concurrent
programs under TSO and PSO. Our method relies on
modeling both types of nondeterminism in a unified
framework, which allows us to extend existing POR
techniques to TSO and PSO without overhauling the
verification algorithm. In addition to sound POR, we
also propose a buffer-bounding method for more
aggressively reducing the state space. We have
implemented our new methods in a stateless model
checking tool and demonstrated their effectiveness on a
set of multithreaded C benchmarks.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '15 conference proceedings.",
author = "Minjia Zhang and Jipeng Huang and Man Cao and Michael
D. Bond",
title = "Low-overhead software transactional memory with
progress guarantees and strong semantics",
journal = j-SIGPLAN,
volume = "50",
number = "8",
pages = "97--108",
month = aug,
year = "2015",
DOI = "https://doi.org/10.1145/2858788.2688510",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Tue Feb 16 12:01:42 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
abstract = "Software transactional memory offers an appealing
alternative to locks by improving programmability,
reliability, and scalability. However, existing STMs
are impractical because they add high instrumentation
costs and often provide weak progress guarantees and/or
semantics. This paper introduces a novel STM called
LarkTM that provides three significant features. (1)
Its instrumentation adds low overhead except when
accesses actually conflict, enabling low single-thread
overhead and scaling well on low-contention workloads.
(2) It uses eager concurrency control mechanisms, yet
naturally supports flexible conflict resolution,
enabling strong progress guarantees. (3) It naturally
provides strong atomicity semantics at low cost.
LarkTM's design works well for low-contention
workloads, but adds significant overhead under higher
contention, so we design an adaptive version of LarkTM
that uses alternative concurrency control for
high-contention objects. An implementation and
evaluation in a Java virtual machine show that the
basic and adaptive versions of LarkTM not only provide
low single-thread overhead, but their multithreaded
performance compares favorably with existing
high-performance STMs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '15 conference proceedings.",
author = "Zhong Zheng and Zhiying Wang and Mikko Lipasti",
title = "Adaptive Cache and Concurrency Allocation on
volume = "14",
number = "2",
pages = "90--93",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2359882",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Memory bandwidth is critical to GPGPU performance.
Exploiting locality in caches can better utilize memory
bandwidth. However, memory requests issued by excessive
threads cause cache thrashing and saturate memory
bandwidth, degrading performance. In this paper, we
propose adaptive cache and concurrency allocation (CCA)
to prevent cache thrashing and improve the utilization
of bandwidth and computational resources, hence
improving performance. According to locality and reuse
distance of access patterns in GPGPU program, warps on
a stream multiprocessor are dynamically divided into
three groups: cached, bypassed, and waiting. The data
cache accommodates the footprint of cached warps.
Bypassed warps cannot allocate cache lines in the data
cache to prevent cache thrashing, but are able to take
advantage of available memory bandwidth and
computational resource. Waiting warps are de-scheduled.
Experimental results show that adaptive CCA can
significantly improve benchmark performance, with 80
percent harmonic mean IPC improvement over the
acknowledgement = ack-nhfb,
affiliation = "Zheng, Z (Reprint Author), Natl Univ Def Technol,
State Key Lab High Performance Comp, Changsha, Hunan,
Peoples R China. Zheng, Zhong; Wang, Zhiying, Natl Univ
Def Technol, State Key Lab High Performance Comp,
Changsha, Hunan, Peoples R China. Zheng, Zhong; Wang,
Zhiying, Natl Univ Def Technol, Sch Comp, Changsha,
Hunan, Peoples R China. Lipasti, Mikko, Univ Wisconsin,
Dept Elect \& Comp Engn, Madison, WI 54706 USA.",
author-email = "zheng\_zhong@nudt.edu.cn zywang@nudt.edu.cn
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "CSC; 863 Program [2012AA010905]; NSFC
[61070037, 61272143, 61272144, 61103016, 61202121];
NUDT [B120607]; RFDP [20114307120013]; NSF
funding-text = "This work was partially supported by CSC, 863 Program
(2012AA010905), NSFC (61070037, 61272143, 61272144,
61103016, 61202121), NUDT(B120607), RFDP
(20114307120013), and NSF (CCF-1318298).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "access patterns; adaptive cache-and-concurrency
allocation; Bandwidth; bandwidth utilization
improvement; benchmark performance improvement;
Benchmark testing; bypassed warps; cache; cache lines;
cache locality; Cache memory; cache storage; cache
thrashing prevention; cached warps; CCA; computational
resource utilization improvement; concurrency;
concurrency control; Concurrent computing; GPGPU; GPGPU
performance improvement; graphics processing units;
harmonic mean IPC improvement; Instruction sets; memory
bandwidth saturation; multi-threading; multiprocessing
systems; performance evaluation; Resource management;
reuse distance; stream multiprocessor; waiting warp
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Zheng:2015:ACC",
web-of-science-categories = "Computer Science, Hardware \&
author = "N. Altiparmak and A. S. Tosun",
title = "Multithreaded Maximum Flow Based Optimal Replica
Selection Algorithm for Heterogeneous Storage
journal = j-IEEE-TRANS-COMPUT,
volume = "65",
number = "5",
pages = "1543--1557",
month = may,
year = "2016",
DOI = "https://doi.org/10.1109/TC.2015.2451620",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Fri Apr 15 13:39:43 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Mohammad Arjomand and Mahmut T. Kandemir and Anand
Sivasubramaniam and Chita R. Das",
title = "Boosting access parallelism to {PCM}-based main
journal = j-COMP-ARCH-NEWS,
volume = "44",
number = "3",
pages = "695--706",
month = jun,
year = "2016",
DOI = "https://doi.org/10.1145/3007787.3001211",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Thu Jan 12 18:43:43 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Despite its promise as a DRAM main memory replacement,
Phase Change Memory (PCM) has high write latencies
which can be a serious detriment to its widespread
adoption. Apart from slowing down a write request, the
consequent high latency can also keep other chips of
the same rank, that are not involved in this write,
idle for long times. There are several practical
considerations that make it difficult to allow
subsequent reads and/or writes to be served
concurrently from the same chips during the long
latency write. This paper proposes and evaluates
several novel mechanisms --- re-constructing data from
error correction bits instead of waiting for chips
currently busy to serve a read, rotating word mappings
across chips of a PCM rank, and rotating the mapping of
error detection/correction bits across these chips ---
to overlap several reads with an ongoing write (RoW)
and even a write with an ongoing write (WoW). The paper
also presents the necessary micro-architectural
enhancements needed to implement these mechanisms,
without significantly changing the current interfaces.
The resulting PCM access parallelism (PCMap) system
incorporating these enhancements, boosts the
intra-rank-level parallelism during such writes from a
very low baseline value of 2.4 to an average and
maximum values of 4.5 and 7.4, respectively (out of a
maximum of 8.0), across a wide spectrum of both
multiprogrammed and multithreaded workloads. This boost
in parallelism results in an average IPC improvement of
15.6\% and 16.7\% for the multiprogrammed and
multithreaded workloads, respectively.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
remark = "ISCA '16 conference proceedings.",
author = "Michael Badamo and Jeff Casarona and Minshu Zhao and
Donald Yeung",
title = "Identifying Power-Efficient Multicore Cache
Hierarchies via Reuse Distance Analysis",
journal = j-TOCS,
volume = "34",
number = "1",
pages = "3:1--3:??",
month = apr,
year = "2016",
DOI = "https://doi.org/10.1145/2851503",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Sat May 21 08:09:53 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/;
abstract = "To enable performance improvements in a
power-efficient manner, computer architects have been
building CPUs that exploit greater amounts of
thread-level parallelism. A key consideration in such
CPUs is properly designing the on-chip cache hierarchy.
Unfortunately, this can be hard to do, especially for
CPUs with high core counts and large amounts of cache.
The enormous design space formed by the combinatorial
number of ways in which to organize the cache hierarchy
makes it difficult to identify power-efficient
configurations. Moreover, the problem is exacerbated by
the slow speed of architectural simulation, which is
the primary means for conducting such design space
studies. A powerful tool that can help architects
optimize CPU cache hierarchies is reuse distance (RD)
analysis. Recent work has extended uniprocessor RD
techniques---i.e., by introducing concurrent RD and
private-stack RD profiling---to enable analysis of
different types of caches in multicore CPUs. Once
acquired, parallel locality profiles can predict the
performance of numerous cache configurations,
permitting highly efficient design space exploration.
To date, existing work on multicore RD analysis has
focused on developing the profiling techniques and
assessing their accuracy. Unfortunately, there has been
no work on using RD analysis to optimize CPU
performance or power consumption. This article
investigates applying multicore RD analysis to identify
the most power efficient cache configurations for a
multicore CPU. First, we develop analytical models that
use the cache-miss counts from parallel locality
profiles to estimate CPU performance and power
consumption. Although future scalable CPUs will likely
employ multithreaded (and even out-of-order) cores, our
current study assumes single-threaded in-order cores to
simplify the models, allowing us to focus on the cache
hierarchy and our RD-based techniques. Second, to
demonstrate the utility of our techniques, we apply our
models to optimize a large-scale tiled CPU architecture
with a two-level cache hierarchy. We show that the most
power efficient configuration varies considerably
across different benchmarks, and that our locality
profiles provide deep insights into why certain
configurations are power efficient. We also show that
picking the best configuration can provide significant
gains, as there is a 2.01x power efficiency spread
across our tiled CPU design space. Finally, we validate
the accuracy of our techniques using detailed
simulation. Among several simulated configurations, our
techniques can usually pick the most power efficient
configuration, or one that is very close to the best.
In addition, across all simulated configurations, we
can predict power efficiency with 15.2\% error.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
author = "Jonathan Balkind and Michael McKeown and Yaosheng Fu
and Tri Nguyen and Yanqi Zhou and Alexey Lavrov and
Mohammad Shahrad and Adi Fuchs and Samuel Payne and
Xiaohua Liang and Matthew Matl and David Wentzlaff",
title = "{OpenPiton}: an Open Source Manycore Research
journal = j-OPER-SYS-REV,
volume = "50",
number = "2",
pages = "217--232",
month = jun,
year = "2016",
DOI = "https://doi.org/10.1145/2954680.2872414",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Thu Jun 9 17:03:34 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/gnu.bib;
abstract = "Industry is building larger, more complex, manycore
processors on the back of strong institutional
knowledge, but academic projects face difficulties in
replicating that scale. To alleviate these difficulties
and to develop and share knowledge, the community needs
open architecture frameworks for simulation, synthesis,
and software exploration which support extensibility,
scalability, and configurability, alongside an
established base of verification tools and supported
software. In this paper we present OpenPiton, an open
source framework for building scalable architecture
research prototypes from 1 core to 500 million cores.
OpenPiton is the world's first open source,
general-purpose, multithreaded manycore processor and
framework. OpenPiton leverages the industry hardened
OpenSPARC T1 core with modifications and builds upon it
with a scratch-built, scalable uncore creating a
flexible, modern manycore design. In addition,
OpenPiton provides synthesis and backend scripts for
ASIC and FPGA to enable other researchers to bring
their designs to implementation. OpenPiton provides a
complete verification infrastructure of over 8000
tests, is supported by mature software tools, runs
full-stack multiuser Debian Linux, and is written in
industry standard Verilog. Multiple implementations of
OpenPiton have been created including a taped-out
25-core implementation in IBM's 32nm process and
multiple Xilinx FPGA prototypes.",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J597",
author = "Man Cao and Minjia Zhang and Aritra Sengupta and
Michael D. Bond",
title = "Drinking from both glasses: combining pessimistic and
optimistic tracking of cross-thread dependences",
journal = j-SIGPLAN,
volume = "51",
number = "8",
pages = "20:1--20:??",
month = aug,
year = "2016",
DOI = "https://doi.org/10.1145/3016078.2851143",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sat Sep 16 10:18:12 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "It is notoriously challenging to develop parallel
software systems that are both scalable and correct.
Runtime support for parallelism---such as multithreaded
record {\&} replay, data race detectors, transactional
memory, and enforcement of stronger memory
models---helps achieve these goals, but existing
commodity solutions slow programs substantially in
order to track (i.e., detect or control) an execution's
cross-thread dependences accurately. Prior work tracks
cross-thread dependences either ``pessimistically,''
slowing every program access, or ``optimistically,''
allowing for lightweight instrumentation of most
accesses but dramatically slowing accesses involved in
cross-thread dependences. This paper seeks to hybridize
pessimistic and optimistic tracking, which is
challenging because there exists a fundamental mismatch
between pessimistic and optimistic tracking. We address
this challenge based on insights about how dependence
tracking and program synchronization interact, and
introduce a novel approach called hybrid tracking.
Hybrid tracking is suitable for building efficient
runtime support, which we demonstrate by building
hybrid-tracking-based versions of a dependence recorder
and a region serializability enforcer. An adaptive,
profile-based policy makes runtime decisions about
switching between pessimistic and optimistic tracking.
Our evaluation shows that hybrid tracking enables
runtime support to overcome the performance limitations
of both pessimistic and optimistic tracking alone.",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '16 conference proceedings.",
author = "Kuan-Hsun Chen and Jian-Jia Chen and Florian Kriebel
and Semeen Rehman and Muhammad Shafique and J{\"o}rg
title = "Task Mapping for Redundant Multithreading in
Multi-Cores with Reliability and Performance
journal = j-IEEE-TRANS-COMPUT,
volume = "65",
number = "11",
pages = "3441--3455",
month = nov,
year = "2016",
DOI = "https://doi.org/10.1109/TC.2016.2532862",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Tue Oct 11 05:14:24 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Timothy Creech and Rajeev Barua",
title = "Transparently Space Sharing a Multicore Among Multiple
journal = j-TOPC,
volume = "3",
number = "3",
pages = "17:1--17:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3001910",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Mon Dec 26 17:40:41 MST 2016",
bibsource = "http://topc.acm.org/;
abstract = "As hardware becomes increasingly parallel and the
availability of scalable parallel software improves,
the problem of managing multiple multithreaded
applications (processes) becomes important. Malleable
processes, which can vary the number of threads used as
they run, enable sophisticated and flexible resource
management. Although many existing applications
parallelized for SMPs with parallel runtimes are in
fact already malleable, deployed runtime environments
provide no interface nor any strategy for intelligently
allocating hardware threads or even preventing
oversubscription. Prior research methods either depend
on profiling applications ahead of time to make good
decisions about allocations or do not account for
process efficiency at all, leading to poor performance.
None of these prior methods have been adapted widely in
practice. This article presents the Scheduling and
Allocation with Feedback (SCAF) system: a drop-in
runtime solution that supports existing malleable
applications in making intelligent allocation decisions
based on observed efficiency without any changes to
semantics, program modification, offline profiling, or
even recompilation. Our existing implementation can
control most unmodified OpenMP applications. Other
malleable threading libraries can also easily be
supported with small modifications without requiring
application modification or recompilation. In this
work, we present the SCAF daemon and a SCAF-aware port
of the GNU OpenMP runtime. We present a new technique
for estimating process efficiency purely at runtime
using available hardware counters and demonstrate its
effectiveness in aiding allocation decisions. We
evaluated SCAF using NAS NPB parallel benchmarks on
five commodity parallel platforms, enumerating
architectural features and their effects on our scheme.
We measured the benefit of SCAF in terms of sum of
speedups improvement (a common metric for
multiprogrammed environments) when running all
benchmark pairs concurrently compared to
equipartitioning---the best existing competing scheme in
the literature. We found that SCAF improves on
equipartitioning on four out of five machines, showing
a mean improvement factor in sum of speedups of 1.04 to
1.11x for benchmark pairs, depending on the machine,
and 1.09x on average. Since we are not aware of any
widely available tool for equipartitioning, we also
compare SCAF against multiprogramming using unmodified
OpenMP, which is the only environment available to end
users today. SCAF improves on the unmodified OpenMP
runtimes for all five machines, with a mean improvement
of 1.08 to 2.07x, depending on the machine, and 1.59x
on average.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
author = "Benoit Daloze and Stefan Marr and Daniele Bonetta and
Hanspeter M{\"o}ssenb{\"o}ck",
title = "Efficient and thread-safe objects for
dynamically-typed languages",
journal = j-SIGPLAN,
volume = "51",
number = "10",
pages = "642--659",
month = oct,
year = "2016",
DOI = "https://doi.org/10.1145/3022671.2984001",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sat Sep 16 10:18:13 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "We are in the multi-core era. Dynamically-typed
languages are in widespread use, but their support for
multithreading still lags behind. One of the reasons is
that the sophisticated techniques they use to
efficiently represent their dynamic object models are
often unsafe in multithreaded environments. This paper
defines safety requirements for dynamic object models
in multithreaded environments. Based on these
requirements, a language-agnostic and thread-safe
object model is designed that maintains the efficiency
of sequential approaches. This is achieved by ensuring
that field reads do not require synchronization and
field updates only need to synchronize on objects
shared between threads. Basing our work on
JRuby+Truffle, we show that our safe object model has
zero overhead on peak performance for thread-local
objects and only 3\% average overhead on parallel
benchmarks where field updates require synchronization.
Thus, it can be a foundation for safe and efficient
multithreaded VMs for a wide range of dynamic
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "OOPSLA '16 conference proceedings.",
author = "Etem Deniz and Alper Sen",
title = "Using Machine Learning Techniques to Detect Parallel
Patterns of Multi-threaded Applications",
journal = j-INT-J-PARALLEL-PROG,
volume = "44",
number = "4",
pages = "867--900",
month = aug,
year = "2016",
DOI = "https://doi.org/10.1007/s10766-015-0396-z",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Tue Sep 20 10:50:00 MDT 2016",
bibsource = "http://link.springer.com/journal/10766/44/4;
URL = "http://link.springer.com/article/10.1007/s10766-015-0396-z",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
author = "Tyler Denniston and Shoaib Kamil and Saman
title = "Distributed {Halide}",
journal = j-SIGPLAN,
volume = "51",
number = "8",
pages = "5:1--5:??",
month = aug,
year = "2016",
DOI = "https://doi.org/10.1145/3016078.2851157",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sat Sep 16 10:18:12 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Many image processing tasks are naturally expressed as
a pipeline of small computational kernels known as
stencils. Halide is a popular domain-specific language
and compiler designed to implement image processing
algorithms. Halide uses simple language constructs to
express what to compute and a separate scheduling
co-language for expressing when and where to perform
the computation. This approach has demonstrated
performance comparable to or better than hand-optimized
code. Until now, however, Halide has been restricted to
parallel shared memory execution, limiting its
performance for memory-bandwidth-bound pipelines or
large-scale image processing tasks. We present an
extension to Halide to support distributed-memory
parallel execution of complex stencil pipelines. These
extensions compose with the existing scheduling
constructs in Halide, allowing expression of complex
computation and communication strategies. Existing
Halide applications can be distributed with minimal
changes, allowing programmers to explore the tradeoff
between recomputation and communication with little
effort. Approximately 10 new lines of code are needed
even for a 200 line, 99 stage application. On nine
image processing benchmarks, our extensions give up to
a 1.4$ \times $ speedup on a single node over regular
multithreaded execution with the same number of cores,
by mitigating the effects of non-uniform memory access.
The distributed benchmarks achieve up to 18$ \times $
speedup on a 16 node testing machine and up to 57$
\times $ speedup on 64 nodes of the NERSC Cori
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '16 conference proceedings.",
author = "Andreas Diavastos and Pedro Trancoso and Mikel
Luj{\'a}n and Ian Watson",
title = "Integrating Transactions into the Data-Driven
Multi-threading Model Using the {TFlux} Platform",
journal = j-INT-J-PARALLEL-PROG,
volume = "44",
number = "2",
pages = "257--277",
month = apr,
year = "2016",
DOI = "https://doi.org/10.1007/s10766-015-0369-2",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Thu Apr 7 12:08:24 MDT 2016",
bibsource = "http://link.springer.com/journal/10766/44/2;
URL = "http://link.springer.com/article/10.1007/s10766-015-0369-2",
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
author = "Saumay Dublish and Vijay Nagarajan and Nigel Topham",
title = "Cooperative Caching for {GPUs}",
journal = j-TACO,
volume = "13",
number = "4",
pages = "39:1--39:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3001589",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "The rise of general-purpose computing on GPUs has
influenced architectural innovation on them. The
introduction of an on-chip cache hierarchy is one such
innovation. High L1 miss rates on GPUs, however,
indicate inefficient cache usage due to myriad factors,
such as cache thrashing and extensive multithreading.
Such high L1 miss rates in turn place high demands on
the shared L2 bandwidth. Extensive congestion in the L2
access path therefore results in high memory access
latencies. In memory-intensive applications, these
latencies get exposed due to a lack of active compute
threads to mask such high latencies. In this article,
we aim to reduce the pressure on the shared L2
bandwidth, thereby reducing the memory access latencies
that lie in the critical path. We identify significant
replication of data among private L1 caches, presenting
an opportunity to reuse data among L1s. We further show
how this reuse can be exploited via an L1 Cooperative
Caching Network (CCN), thereby reducing the bandwidth
demand on L2. In the proposed architecture, we connect
the L1 caches with a lightweight ring network to
facilitate intercore communication of shared data. We
show that this technique reduces traffic to the L2
cache by an average of 29\%, freeing up the bandwidth
for other accesses. We also show that the CCN reduces
the average memory latency by 24\%, thereby reducing
core stall cycles by 26\% on average. This translates
into an overall performance improvement of 14.7\% on
average (and up to 49\%) for applications that exhibit
reuse across L1 caches. In doing so, the CCN incurs a
nominal area and energy overhead of 1.3\% and 2.5\%,
respectively. Notably, the performance improvement with
our proposed CCN compares favorably to the performance
improvement achieved by simply doubling the number of
L2 banks by up to 34\%.",
acknowledgement = ack-nhfb,
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Dmitry Evtyushkin and Dmitry Ponomarev and Nael
title = "Understanding and Mitigating Covert Channels Through
Branch Predictors",
journal = j-TACO,
volume = "13",
number = "1",
pages = "10:1--10:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2870636",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib",
abstract = "Covert channels through shared processor resources
provide secret communication between two malicious
processes: the trojan and the spy. In this article, we
classify, analyze, and compare covert channels through
dynamic branch prediction units in modern processors.
Through experiments on a real hardware platform, we
compare contention-based channel and the channel that
is based on exploiting the branch predictor's residual
state. We analyze these channels in SMT and
single-threaded environments under both clean and noisy
conditions. Our results show that the residual
state-based channel provides a cleaner signal and is
effective even in noisy execution environments with
another application sharing the same physical core with
the trojan and the spy. We also estimate the capacity
of the branch predictor covert channels and describe a
software-only mitigation technique that is based on
randomizing the state of the predictor tables on
context switches. We show that this protection
eliminates all covert channels through the branch
prediction unit with minimal impact on performance.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "J. Feliu and J. Sahuquillo and S. Petit and J. Duato",
title = "Bandwidth-Aware On-Line Scheduling in {SMT}
Multicores",
journal = j-IEEE-TRANS-COMPUT,
volume = "65",
number = "2",
pages = "422--434",
month = "????",
year = "2016",
DOI = "https://doi.org/10.1109/TC.2015.2428694",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Tue Jan 19 07:06:51 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Pawan Harish and Mentar Mahmudi and Beno{\^\i}t {Le
Callennec} and Ronan Boulic",
title = "Parallel Inverse Kinematics for Multithreaded
Architectures",
journal = j-TOG,
volume = "35",
number = "2",
pages = "19:1--19:??",
month = may,
year = "2016",
DOI = "https://doi.org/10.1145/2887740",
ISSN = "0730-0301 (print), 1557-7368 (electronic)",
ISSN-L = "0730-0301",
bibdate = "Mon Jun 20 09:13:19 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tog/",
abstract = "In this article, we present a parallel prioritized
Jacobian-based inverse kinematics algorithm for
multithreaded architectures. We solve damped least
squares inverse kinematics using a parallel line search
by identifying and sampling critical input parameters.
Parallel competing execution paths are spawned for each
parameter in order to select the optimum that minimizes
the error criteria. Our algorithm is highly scalable
and can handle complex articulated bodies at
interactive frame rates. We show results on complex
skeletons consisting of more than 600 degrees of
freedom while being controlled using multiple end
effectors. We implement the algorithm both on multicore
and GPU architectures and demonstrate how the GPU can
further exploit fine-grain parallelism not directly
available on a multicore processor. Our implementations
are 10 to 150 times faster compared to a
state-of-the-art serial implementation while providing
higher accuracy. We also demonstrate the scalability of
the algorithm over multiple scenarios and explore the
GPU implementation in detail.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Transactions on Graphics",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J778",
author = "Milad Hashemi and Debbie Marr and Doug Carmean and
Yale N. Patt",
title = "Efficient Execution of Bursty Applications",
journal = "IEEE Computer Architecture Letters",
volume = "15",
number = "2",
pages = "85--88",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2456013",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The performance of user-facing applications is
critical to client platforms. Many of these
applications are event-driven and exhibit ``bursty''
behavior: the application is generally idle but
generates bursts of activity in response to human
interaction. We study one example of a bursty
application, web-browsers, and produce two important
insights: (1) Activity bursts contain false
parallelism, bringing many cores out of a deep sleep to
inefficiently render a single webpage, and (2) these
bursts are highly compute driven, and thus scale nearly
linearly with frequency. We show average performance
gains/energy reductions of 14\%/17\% respectively on
real hardware by statically moving threads from
multiple cores to a single core. We then propose
dynamic hardware driven thread migration and scheduling
enhancements that detect these bursts, leading to
further benefits.",
acknowledgement = ack-nhfb,
affiliation = "Hashemi, M (Reprint Author), Univ Texas Austin, Elect
\& Comp Engn, Austin, TX 78701 USA. Hashemi, Milad;
Patt, Yale N., Univ Texas Austin, Elect \& Comp Engn,
Austin, TX 78701 USA. Marr, Debbie, Intel Corp, Intel
Labs, Portland, OR USA. Carmean, Doug, Microsoft,
Microsoft Res, Seattle, WA USA.",
author-email = "miladh@hps.utexas.edu debbie.marr@intel.com
dcarmean@microsoft.com patt@hps.utexas.edu",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Intel Corporation; Cockrell Foundation; HPS
Research Group",
funding-text = "The authors thank Intel Corporation and the Cockrell
Foundation for their continued generous financial
support of the HPS Research Group.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Browsers; bursty applications; dynamic hardware;
Energy; energy reductions; Hardware; human computer
interaction; human interaction; Instruction sets;
Internet; Loading; multi-threading; Multicore
processing; multiple cores; multiprocessing systems;
online front-ends; Operating systems; performance;
performance evaluation; performance gains; power aware
computing; thread migration; thread scheduling;
Web-browsers; Webpage; webpages; webpages, thread
scheduling",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Hashemi:2016:EEB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
author = "Qi Hu and Peng Liu and Michael C. Huang",
title = "Threads and Data Mapping: Affinity Analysis for
Traffic Reduction",
journal = "IEEE Computer Architecture Letters",
volume = "15",
number = "2",
pages = "133--136",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2451172",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Modern processors spend significant amount of time and
energy moving data. With the increase in core count,
the relative importance of such latency and energy
expenditure will only increase with time. Inter-core
communication traffic when executing a multithreaded
application is one such source of latency and energy
expenditure. This traffic is influenced by the mapping
of threads and data onto multicore systems. This paper
investigates the impact of threads and data mapping on
traffic in a chip-multiprocessor, and exploits the
potential for traffic reduction through threads and
data mapping. Based on the analysis and estimation of
the lowest traffic, we propose a threads and data
mapping mechanism to approach the lowest traffic. The
mapping takes both the correlation among threads and
the affinity of data with individual threads into
account, and results in significant traffic reduction
and energy savings.",
acknowledgement = ack-nhfb,
affiliation = "Liu, P (Reprint Author), Zhejiang Univ, Coll Informat
Sci \& Elect Engn, Hangzhou 310027, Peoples R China.
Hu, Qi; Liu, Peng, Zhejiang Univ, Coll Informat Sci \&
Elect Engn, Hangzhou 310027, Peoples R China. Huang,
Michael C., Univ Rochester, Dept Elect \& Comp Engn,
601 Elmwood Ave, Rochester, NY 14627 USA.",
author-email = "huqi\_isee@zju.edu.cn liupeng@zju.edu.cn",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSFC [61028004]; US National Science
Foundation (NSF) [1217662, 1255729]; Open Project
Program of the State Key Laboratory of Mathematical
Engineering and Advanced Computing [2014A08, 2015A09]",
funding-text = "This work was supported by NSFC under grant 61028004,
and also in part by US National Science Foundation
(NSF) under grants 1217662 and 1255729, and the Open
Project Program of the State Key Laboratory of
Mathematical Engineering and Advanced Computing under
grants 2014A08 and 2015A09. P. Liu is the corresponding
author.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Mapping; memory; multicore; network-on-chip",
keywords-plus = "NETWORKS; CACHES; CHIP",
number-of-cited-references = "11",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Hu:2016:TDM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
author = "Shiyou Huang and Jeff Huang",
title = "Maximal causality reduction for {TSO} and {PSO}",
journal = j-SIGPLAN,
volume = "51",
number = "10",
pages = "447--461",
month = oct,
year = "2016",
DOI = "https://doi.org/10.1145/3022671.2984025",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sat Sep 16 10:18:13 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Verifying concurrent programs is challenging due to
the exponentially large thread interleaving space. The
problem is exacerbated by relaxed memory models such as
Total Store Order (TSO) and Partial Store Order (PSO)
which further explode the interleaving space by
reordering instructions. A recent advance, Maximal
Causality Reduction (MCR), has shown great promise to
improve verification effectiveness by maximally
reducing redundant explorations. However, the original
MCR only works for the Sequential Consistency (SC)
memory model, but not for TSO and PSO. In this paper,
we develop novel extensions to MCR by solving two key
problems under TSO and PSO: (1) generating
interleavings that can reach new states by encoding the
operational semantics of TSO and PSO with first-order
logical constraints and solving them with SMT solvers,
and (2) enforcing TSO and PSO interleavings by
developing novel replay algorithms that allow
executions out of the program order. We show that our
approach successfully enables MCR to effectively
explore TSO and PSO interleavings. We have compared our
approach with a recent Dynamic Partial Order Reduction
(DPOR) algorithm for TSO and PSO and a SAT-based
stateless model checking approach. Our results show
that our approach is much more effective than the other
approaches for both state-space exploration and bug
finding --- on average it explores 5-10X fewer
executions and finds many bugs that the other tools
cannot find.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "OOPSLA '16 conference proceedings.",
author = "Jeff Huang and Arun K. Rajagopalan",
title = "Precise and maximal race detection from incomplete
traces",
journal = j-SIGPLAN,
volume = "51",
number = "10",
pages = "462--476",
month = oct,
year = "2016",
DOI = "https://doi.org/10.1145/3022671.2984024",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sat Sep 16 10:18:13 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We present RDIT, a novel dynamic technique to detect
data races in multithreaded programs with incomplete
trace information, i.e., in the presence of missing
events. RDIT is both precise and maximal: it does not
report any false alarms and it detects a maximal set of
true traces from the observed incomplete trace. RDIT is
underpinned by a sound BarrierPair model that abstracts
away the missing events by capturing the invocation
data of their enclosing methods. By making the least
conservative abstraction that a missing method
introduces synchronization only when it has a memory
address in scope that overlaps with other events or
other missing methods, and by formulating maximal
thread causality as logical constraints, RDIT
guarantees to precisely detect races with maximal
capability. RDIT has been applied in seven real-world
large concurrent systems and has detected dozens of
true races with zero false alarms. Comparatively,
existing algorithms such as Happens-Before,
Causal-Precedes, and Maximal-Causality which are known
to be precise all report many false alarms when missing
events exist.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "OOPSLA '16 conference proceedings.",
author = "Chuntao Jiang and Zhibin Yu and Lieven Eeckhout and
Hai Jin and Xiaofei Liao and Chengzhong Xu",
title = "Two-Level Hybrid Sampled Simulation of Multithreaded
Applications",
journal = j-TACO,
volume = "12",
number = "4",
pages = "39:1--39:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2818353",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Sampled microarchitectural simulation of
single-threaded applications is mature technology for
over a decade now. Sampling multithreaded applications,
on the other hand, is much more complicated. Not until
very recently have researchers proposed solutions for
sampled simulation of multithreaded applications.
Time-Based Sampling (TBS) samples multithreaded
application execution based on time---not instructions
as is typically done for single-threaded
applications---yielding estimates for a multithreaded
application's execution time. In this article, we
revisit and analyze previously proposed TBS approaches
(periodic and cantor fractal based sampling), and we
obtain a number of novel and surprising insights, such
as (i) accurately estimating fast-forwarding IPC, that
is, performance in-between sampling units, is more
important than accurately estimating sample IPC, that
is, performance within the sampling units; (ii)
fast-forwarding IPC estimation accuracy is determined
by both the sampling unit distribution and how to use
the sampling units to predict fast-forwarding IPC; and
(iii) cantor sampling is more accurate at small
sampling unit sizes, whereas periodic is more accurate
at large sampling unit sizes. These insights lead to
the development of Two-level Hybrid Sampling (THS), a
novel sampling methodology for multithreaded
applications that combines periodic sampling's accuracy
at large time scales (i.e., uniformly selecting
coarse-grain sampling units across the entire program
execution) with cantor sampling's accuracy at small
time scales (i.e., the ability to accurately predict
fast-forwarding IPC in-between small sampling units).
The clustered occurrence of small sampling units under
cantor sampling also enables shortened warmup and thus
enhanced simulation speed. Overall, THS achieves an
average absolute execution time prediction error of 4\%
while yielding an average simulation speedup of 40 $
\times $ compared to detailed simulation, which is both
more accurate and faster than the current
state-of-the-art. Case studies illustrate THS' ability
to accurately predict relative performance differences
across the design space.",
acknowledgement = ack-nhfb,
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Daejin Jung and Sheng Li and Jung Ho Ahn",
title = "Large Pages on Steroids: Small Ideas to Accelerate Big
Memory Applications",
journal = "IEEE Computer Architecture Letters",
volume = "15",
number = "2",
pages = "101--104",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2495103",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Utilizing small (e.g., 4 KB) pages incurs frequent TLB
misses on modern big memory applications, substantially
degrading the performance of the system. Large (e.g., 1
GB) pages or direct segments can alleviate this penalty
due to page table walks, but at the same time such a
strategy exposes the organizational and operational
details of modern DRAM-based memory systems to
applications. Row-buffer conflicts caused by accesses
heading to the same DRAM bank but different rows from
multiple threads are regarded as the main culprits
behind the very large gaps between peak and achieved
main memory throughput, but hardware-based approaches
in memory controllers have achieved only limited
success whereas existing proposals that change memory
allocators cannot be applied to large pages or direct
segments. In this paper, we propose a set of
application-level techniques to improve the effective
main memory bandwidth. The techniques stem from the two
key observations that (1) each thread of an application
exclusively accesses certain datasets for a short or
long period of time, and (2) superfluous memory reads
originating from a cache's write allocation policy can
be avoided if scatters during the data shuffling pass
through intermediate cache-friendly buffers.
Experiments with a contemporary x86 server show that
combining large pages with the proposed address
linearization, bank coloring, and write streaming
techniques improves the performance of the three big
memory applications of high-throughput key-value store,
fast-Fourier transform, and radix sort by 37.6, 22.9,
and 68.1 percent, respectively.",
acknowledgement = ack-nhfb,
affiliation = "Jung, D (Reprint Author), Seoul Natl Univ, Dept
Transdisciplinary Studies, Seoul, South Korea. Jung,
Daejin; Ahn, Jung Ho, Seoul Natl Univ, Dept
Transdisciplinary Studies, Seoul, South Korea. Li,
Sheng, Intel Labs, Santa Clara, CA USA. Ahn, Jung Ho,
Seoul Natl Univ, Big Data Inst, Seoul, South Korea.",
author-email = "haidj@snu.ac.kr sheng.r.li@intel.com gajh@snu.ac.kr",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Research Foundation of Korea -
Korea government [NRF-2014R1A2A1A11052936,
NRF-2012M3A9D1054622]",
funding-text = "The authors thank Jongwook Chung and Jaeyoon Choi on
their contributions to application writing and
experiments. This work was partially supported by the
National Research Foundation of Korea grant funded by
the Korea government (NRF-2014R1A2A1A11052936 and
NRF-2012M3A9D1054622). Jung Ho Ahn is also with Big
Data Institute, Seoul National University.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "address linearization; application-level techniques;
Bandwidth; bank coloring; big memory applications;
cache storage; cache write allocation policy;
cache-friendly buffers; data shuffling; DRAM bank; DRAM
chips; DRAM-based memory; fast-Fourier transform;
high-throughput key-value store; Instruction sets;
large pages; memory allocators; memory bandwidth;
memory controllers; Memory management; memory
throughput; multi-threading; multiple threads;
Performance gain; Physical-to-DRAM address mapping;
radix sort; Random access memory; row-buffer conflicts;
Servers; superfluous memory reads; write streaming",
number-of-cited-references = "14",
ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Jung:2016:LPS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
author = "Rajshekar Kalayappan and Smruti R. Sarangi",
title = "{FluidCheck}: a Redundant Threading-Based Approach for
Reliable Execution in Manycore Processors",
journal = j-TACO,
volume = "12",
number = "4",
pages = "55:1--55:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2842620",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Soft errors have become a serious cause of concern
with reducing feature sizes. The ability to accommodate
complex, Simultaneous Multithreading (SMT) cores on a
single chip presents a unique opportunity to achieve
reliable execution, safe from soft errors, with low
performance penalties. In this context, we present
FluidCheck, a checker architecture that allows highly
flexible assignment and migration of checking duties
across cores. In this article, we present a mechanism
to dynamically use the resources of SMT cores for
checking the results of other threads, and propose a
variety of heuristics for migration of such checker
threads across cores. Secondly, to make the process of
checking more efficient, we propose a set of
architectural enhancements that reduce power
consumption, decrease the length of the critical path,
and reduce the load on the Network-on-Chip (NoC). Based
on our observations, we design a 16 core system for
running SPEC2006 based bag-of-tasks applications. Our
experiments demonstrate that fully reliable execution
can be attained with a mere 27\% slowdown, surpassing
traditional redundant threading based techniques by
roughly 42\%.",
acknowledgement = ack-nhfb,
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Youngho Kim and Joong Chae Na and Heejin Park and
Jeong Seop Sim",
title = "A space-efficient alphabet-independent
{Four-Russians}' lookup table and a multithreaded
{Four-Russians}' edit distance algorithm",
journal = j-THEOR-COMP-SCI,
volume = "656 (Part B)",
number = "??",
pages = "173--179",
day = "20",
month = dec,
year = "2016",
ISSN = "0304-3975 (print), 1879-2294 (electronic)",
ISSN-L = "0304-3975",
bibdate = "Fri Dec 9 12:17:02 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.sciencedirect.com/science/article/pii/S0304397516300676",
acknowledgement = ack-nhfb,
fjournal = "Theoretical Computer Science",
journal-URL = "http://www.sciencedirect.com/science/journal/03043975/",
author = "Takuro Kutsuna and Yoshinao Ishii",
title = "Abstraction and refinement of mathematical functions
toward {SMT}-based test-case generation",
journal = "International Journal on Software Tools for Technology
Transfer (STTT)",
volume = "18",
number = "1",
pages = "109--120",
month = feb,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1007/s10009-015-0389-7",
ISSN = "1433-2779 (print), 1433-2787 (electronic)",
ISSN-L = "1433-2779",
bibdate = "Mon Jan 25 08:12:53 MST 2016",
bibsource = "http://link.springer.com/journal/10009/18/1",
URL = "http://link.springer.com/article/10.1007/s10009-015-0389-7",
acknowledgement = ack-nhfb,
fjournal = "International Journal on Software Tools for Technology
Transfer (STTT)",
journal-URL = "http://link.springer.com/journal/10009",
author = "Bo-Cheng Charles Lai and Luis Garrido Platero and
Hsien-Kai Kuo",
title = "A Quantitative Method to Data Reuse Patterns of {SIMT}
Applications",
journal = "IEEE Computer Architecture Letters",
volume = "15",
number = "2",
pages = "73--76",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2491279",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Understanding data reuse patterns of a computing
system is crucial to effective design optimization. The
emerging Single Instruction Multiple Threads (SIMT)
processor adopts a programming model that is
fundamentally disparate from conventional scalar
processors. There is a lack of analytical approaches to
quantify the data reuse of SIMT applications. This
paper presents a quantitative method to study the data
reuse inherent to SIMT applications. A metric, Data
Reuse Degree, is defined to measure the amount of
reused data between memory references, and associate
each data reuse degree to a temporal distance
representing the virtual time of the execution process.
The experiments are performed on an abstracted SIMT
processor that considers the programming model and
runtime specifics. The experiments illustrate diverse
data reuse patterns of SIMT applications and explore
the impacts of architectural limitations.",
acknowledgement = ack-nhfb,
affiliation = "Lai, BCC (Reprint Author), Natl Chiao Tung Univ, Dept
Elect Engn, Hsinchu 300, Taiwan. Lai, Bo-Cheng Charles,
Natl Chiao Tung Univ, Dept Elect Engn, Hsinchu 300,
Taiwan. Platero, Luis Garrido, Barcelona Super Comp
Ctr, Barcelona, Spain. Kuo, Hsien-Kai, MediaTek Inc,
Hsinchu, Taiwan.",
author-email = "bclai@mail.nctu.edu.tw luis.garrido.platero@gmail.com",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "MOST [104-2221-E-009-079]",
funding-text = "This project was supported by MOST grant
104-2221-E-009-079.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architectural limitations; cache memory; Cache memory;
computing system; data analysis; data reuse degree;
data reuse patterns; design optimization; execution
process; Graphics processing units; Instruction sets;
Measurement; Memory management; multi-threading;
Parallel architectures; Parallel architectures, cache
memory, parallel processing; parallel processing;
Parallel processing; programming model; scalar
processors; SIMT applications; SIMT processors;
single-instruction multiple-threads processors; virtual
time",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Lai:2016:QMD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
author = "Jing Li and Hung-Wei Tseng and Chunbin Lin and Yannis
Papakonstantinou and Steven Swanson",
title = "{HippogriffDB}: balancing {I/O} and {GPU} bandwidth in
big data analytics",
journal = "Proceedings of the VLDB Endowment",
volume = "9",
number = "14",
pages = "1647--1658",
month = oct,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
bibdate = "Wed Oct 12 10:14:56 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "As data sets grow and conventional processor
performance scaling slows, data analytics move towards
heterogeneous architectures that incorporate hardware
accelerators (notably GPUs) to continue scaling
performance. However, existing GPU-based databases fail
to deal with big data applications efficiently: their
execution model suffers from scalability limitations on
GPUs whose memory capacity is limited; existing systems
fail to consider the discrepancy between fast GPUs and
slow storage, which can counteract the benefit of GPU
accelerators. In this paper, we propose HippogriffDB,
an efficient, scalable GPU-accelerated OLAP system. It
tackles the bandwidth discrepancy using compression and
an optimized data transfer path. HippogriffDB stores
tables in a compressed format and uses the GPU for
decompression, trading GPU cycles for the improved I/O
bandwidth. To improve the data transfer efficiency,
HippogriffDB introduces a peer-to-peer, multi-threaded
data transfer mechanism, directly transferring data
from the SSD to the GPU. HippogriffDB adopts a
query-over-block execution model that provides
scalability using a stream-based approach. The model
improves kernel efficiency with the operator fusion and
double buffering mechanism. We have implemented
HippogriffDB using an NVMe SSD, which talks directly to
a commercial GPU. Results on two popular benchmarks
demonstrate its scalability and efficiency.
HippogriffDB outperforms existing GPU-based databases
(YDB) and in-memory data analytics (MonetDB) by 1-2
orders of magnitude.",
acknowledgement = ack-nhfb,
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
author = "Cheng Ling and Tsuyoshi Hamada and Jingyang Gao and
Guoguang Zhao and Donghong Sun and Weifeng Shi",
title = "{MrBayes tgMC 3++}: a High Performance and
Resource-Efficient {GPU}-Oriented Phylogenetic Analysis
Method",
journal = j-TCBB,
volume = "13",
number = "5",
pages = "845--854",
month = sep,
year = "2016",
DOI = "https://doi.org/10.1109/TCBB.2015.2495202",
ISSN = "1545-5963 (print), 1557-9964 (electronic)",
ISSN-L = "1545-5963",
bibdate = "Fri Dec 30 16:19:30 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "MrBayes is a widespread phylogenetic inference tool
harnessing empirical evolutionary models and Bayesian
statistics. However, the computational cost on the
likelihood estimation is very expensive, resulting in
undesirably long execution time. Although a number of
multi-threaded optimizations have been proposed to
speed up MrBayes, there are bottlenecks that severely
limit the GPU thread-level parallelism of likelihood
estimations. This study proposes a high performance and
resource-efficient method for GPU-oriented
parallelization of likelihood estimations. Instead of
having to rely on empirical programming, the proposed
novel decomposition storage model implements high
performance data transfers implicitly. In terms of
performance improvement, a speedup factor of up to 178
can be achieved on the analysis of simulated datasets
by four Tesla K40 cards. In comparison to the other
publicly available GPU-oriented MrBayes, the tgMC$^3$
++ method proposed herein outperforms the tgMC$^3$
v1.0, nMC$^3$ v2.1.1 and oMC$^3$ v1.00 methods by
speedup factors of up to 1.6, 1.9 and 2.9,
respectively. Moreover, tgMC$^3$ ++ supports more
evolutionary models and gamma categories, which
previous GPU-oriented methods fail to take into
account.",
acknowledgement = ack-nhfb,
fjournal = "IEEE/ACM Transactions on Computational Biology and
Bioinformatics",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J954",
author = "Yongchao Liu and Thomas Hankeln and Bertil Schmidt",
title = "Parallel and space-efficient construction of
{Burrows--Wheeler} transform and suffix array for big
genome data",
journal = j-TCBB,
volume = "13",
number = "3",
pages = "592--598",
month = may,
year = "2016",
ISSN = "1545-5963 (print), 1557-9964 (electronic)",
ISSN-L = "1545-5963",
bibdate = "Mon Aug 29 06:50:39 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Next-generation sequencing technologies have led to
the sequencing of more and more genomes, propelling
related research into the era of big data. In this
paper, we present ParaBWT, a parallelized
Burrows--Wheeler transform (BWT) and suffix array
construction algorithm for big genome data. In ParaBWT,
we have investigated a progressive construction
approach to constructing the BWT of single genome
sequences in linear space complexity, but with a small
constant factor. This approach has been further
parallelized using multi-threading based on a
master-slave coprocessing model. After gaining the BWT,
the suffix array is constructed in a memory-efficient
manner. The performance of ParaBWT has been evaluated
using two sequences generated from two human genome
assemblies: the Ensembl Homo sapiens assembly and the
human reference genome. Our performance comparison to
FMD-index and Bwt-disk reveals that on 12 CPU cores,
ParaBWT runs up to $ 2.2 \times $ faster than FMD-index
and up to $ 99.0 \times $ faster than Bwt-disk. BWT
construction algorithms for very long genomic sequences
are time consuming and (due to their incremental
nature) inherently difficult to parallelize. Thus,
their parallelization is challenging and even
relatively small speedups like the ones of our method
over FMD-index are of high importance to research.
ParaBWT is written in C++, and is freely available at
\url{http://parabwt.sourceforge.net}.",
acknowledgement = ack-nhfb,
fjournal = "IEEE/ACM Transactions on Computational Biology and
Bioinformatics",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J954",
author = "Qixiao Liu and Miquel Moreto and Jaume Abella and
Francisco J. Cazorla and Daniel A. Jimenez and Mateo
Valero",
title = "Sensible Energy Accounting with Abstract Metering for
Multicore Systems",
journal = j-TACO,
volume = "12",
number = "4",
pages = "60:1--60:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2842616",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Chip multicore processors (CMPs) are the preferred
processing platform across different domains such as
data centers, real-time systems, and mobile devices. In
all those domains, energy is arguably the most
expensive resource in a computing system. Accurately
quantifying energy usage in a multicore environment
presents a challenge as well as an opportunity for
optimization. Standard metering approaches are not
capable of delivering consistent results with shared
resources, since the same task with the same inputs may
have different energy consumption based on the mix of
co-running tasks. However, it is reasonable for
data-center operators to charge on the basis of
estimated energy usage rather than time since energy is
more correlated with their actual cost. This article
introduces the concept of Sensible Energy Accounting
(SEA). For a task running in a multicore system, SEA
accurately estimates the energy the task would have
consumed running in isolation with a given fraction of
the CMP shared resources. We explain the potential
benefits of SEA in different domains and describe two
hardware techniques to implement it for a shared
last-level cache and on-core resources in SMT
processors. Moreover, with SEA, an energy-aware
scheduler can find a highly efficient on-chip resource
assignment, reducing by up to 39\% the total processor
energy for a 4-core system.",
acknowledgement = ack-nhfb,
articleno = "60",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Peng Liu and Jiyang Yu and Michael C. Huang",
title = "Thread-Aware Adaptive Prefetcher on Multicore Systems:
Improving the Performance for Multithreaded Workloads",
journal = j-TACO,
volume = "13",
number = "1",
pages = "13:1--13:??",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2890505",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Apr 5 16:27:36 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Most processors employ hardware data prefetching
techniques to hide memory access latencies. However,
the prefetching requests from different threads on a
multicore processor can cause severe interference with
prefetching and/or demand requests of others. The data
prefetching can lead to significant performance
degradation due to shared resource contention on shared
memory multicore systems. This article proposes a
thread-aware data prefetching mechanism based on
low-overhead runtime information to tune prefetching
modes and aggressiveness, mitigating the resource
contention in the memory system. Our solution has three
new components: (1) a self-tuning prefetcher that uses
runtime feedback to dynamically adjust data prefetching
modes and arguments of each thread, (2) a filtering
mechanism that informs the hardware about which
prefetching request can cause shared data invalidation
and should be discarded, and (3) a limiter thread
acceleration mechanism to estimate and accelerate the
critical thread which has the longest completion time
in the parallel region of execution. On a set of
multithreaded parallel benchmarks, our thread-aware
data prefetching mechanism improves the overall
performance of 64-core system by 13\% over a multimode
prefetch baseline system with two-level cache
organization and conventional modified, exclusive,
shared, and invalid-based directory coherence protocol.
We compare our approach with the feedback directed
prefetching technique and find that it provides 9\%
performance improvement on multicore systems, while
saving the memory bandwidth consumption.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Jean-Pierre Lozi and Florian David and Ga{\"e}l Thomas
and Julia Lawall and Gilles Muller",
title = "Fast and Portable Locking for Multicore
Architectures",
journal = j-TOCS,
volume = "33",
number = "4",
pages = "13:1--13:??",
month = jan,
year = "2016",
DOI = "https://doi.org/10.1145/2845079",
ISSN = "0734-2071 (print), 1557-7333 (electronic)",
ISSN-L = "0734-2071",
bibdate = "Wed Jan 6 06:45:30 MST 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tocs/",
abstract = "The scalability of multithreaded applications on
current multicore systems is hampered by the
performance of lock algorithms, due to the costs of
access contention and cache misses. The main
contribution presented in this article is a new locking
technique, Remote Core Locking (RCL), that aims to
accelerate the execution of critical sections in legacy
applications on multicore architectures. The idea of
RCL is to replace lock acquisitions by optimized remote
procedure calls to a dedicated server hardware thread.
RCL limits the performance collapse observed with other
lock algorithms when many threads try to acquire a lock
concurrently and removes the need to transfer
lock-protected shared data to the hardware thread
acquiring the lock, because such data can typically
remain in the server's cache. Other contributions
presented in this article include a profiler that
identifies the locks that are the bottlenecks in
multithreaded applications and that can thus benefit
from RCL, and a reengineering tool that transforms
POSIX lock acquisitions into RCL locks. Eighteen
applications were used to evaluate RCL: the nine
applications of the SPLASH-2 benchmark suite, the seven
applications of the Phoenix 2 benchmark suite,
Memcached, and Berkeley DB with a TPC-C client. Eight
of these applications are unable to scale because of
locks and benefit from RCL on an x86 machine with four
AMD Opteron processors and 48 hardware threads. By
using RCL instead of Linux POSIX locks, performance is
improved by up to 2.5 times on Memcached, and up to
11.6 times on Berkeley DB with the TPC-C client. On a
SPARC machine with two Sun Ultrasparc T2+ processors
and 128 hardware threads, three applications benefit
from RCL. In particular, performance is improved by up
to 1.3 times with respect to Solaris POSIX locks on
Memcached, and up to 7.9 times on Berkeley DB with the
TPC-C client.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Computer Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774",
author = "Yaojie Lu and Seyedamin Rooholamin and Sotirios G.
Ziavras",
title = "Vector Coprocessor Virtualization for Simultaneous
Multithreading",
journal = j-TECS,
volume = "15",
number = "3",
pages = "57:1--57:??",
month = jul,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2898364",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Jul 21 17:18:13 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Vector coprocessors (VPs), commonly being assigned
exclusively to a single thread/core, are not often
performance and energy efficient due to mismatches with
the vector needs of individual applications. We present
in this article an easy-to-implement VP virtualization
technique that, when applied, enables a multithreaded
VP to simultaneously execute multiple threads of
similar or arbitrary vector lengths to achieve improved
aggregate utilization. With a vector register file
(VRF) virtualization technique invented to dynamically
allocate physical vector registers to threads, our VP
virtualization approach improves programmer
productivity by providing at runtime a distinct
physical register name space to each competing thread,
thus eliminating the need to solve register-name
conflicts statically. We applied our virtualization
technique to a multithreaded VP and prototyped an
FPGA-based multicore processor system that supports VP
sharing as well as power gating for better energy
efficiency. Under the dynamic creation of disparate
threads, our benchmarking results show impressive VP
speedups of up to 333\% and total energy savings of up
to 37\% with proper thread scheduling and power gating
compared to a similar-sized system that allows VP
access to just one thread at a time.",
acknowledgement = ack-nhfb,
articleno = "57",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840",
author = "Nuno Machado and Daniel Quinta and Brandon Lucia and
Lu{\'\i}s Rodrigues",
title = "Concurrency Debugging with Differential Schedule
Projections",
journal = j-TOSEM,
volume = "25",
number = "2",
pages = "14:1--14:??",
month = may,
year = "2016",
DOI = "https://doi.org/10.1145/2885495",
ISSN = "1049-331X (print), 1557-7392 (electronic)",
ISSN-L = "1049-331X",
bibdate = "Mon May 16 16:22:08 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tosem/",
abstract = "We present Symbiosis: a concurrency debugging
technique based on novel differential schedule
projections (DSPs). A DSP shows the small set of memory
operations and dataflows responsible for a failure, as
well as a reordering of those elements that avoids the
failure. To build a DSP, Symbiosis first generates a
full, failing, multithreaded schedule via thread path
profiling and symbolic constraint solving. Symbiosis
selectively reorders events in the failing schedule to
produce a nonfailing, alternate schedule. A DSP reports
the ordering and dataflow differences between the
failing and nonfailing schedules. Our evaluation on
buggy real-world software and benchmarks shows that, in
practical time, Symbiosis generates DSPs that both
isolate the small fraction of event orders and
dataflows responsible for the failure and report which
event reorderings prevent failing. In our experiments,
DSPs contain 90\% fewer events and 96\% fewer dataflows
than the full failure-inducing schedules. We also
conducted a user study that shows that, by allowing
developers to focus on only a few events, DSPs reduce
the amount of time required to understand the bug's
root cause and find a valid fix.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Software Engineering and
Methodology",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J790",
author = "Daniel Marino and Abhayendra Singh and Todd Millstein
and Madanlal Musuvathi and Satish Narayanasamy",
title = "{drf x}: an Understandable, High Performance, and
Flexible Memory Model for Concurrent Languages",
journal = j-TOPLAS,
volume = "38",
number = "4",
pages = "16:1--16:??",
month = oct,
year = "2016",
DOI = "https://doi.org/10.1145/2925988",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Tue Oct 18 11:41:44 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/",
abstract = "The most intuitive memory model for shared-memory
multi-threaded programming is sequential consistency
(SC), but it disallows the use of many compiler and
hardware optimizations and thus affects performance.
Data-race-free (DRF) models, such as the C++11 memory
model, guarantee SC execution for data-race-free
programs. But these models provide no guarantee at all
for racy programs, compromising the safety and
debuggability of such programs. To address the safety
issue, the Java memory model, which is also based on
the DRF model, provides a weak semantics for racy
executions. However, this semantics is subtle and
complex, making it difficult for programmers to reason
about their programs and for compiler writers to ensure
the correctness of compiler optimizations. We present
the drf x memory model, which is simple for programmers
to understand and use while still supporting many
common optimizations. We introduce a memory model (MM)
exception that can be signaled to halt execution. If a
program executes without throwing this exception, then
drf x guarantees that the execution is SC. If a program
throws an MM exception during an execution, then drf x
guarantees that the program has a data race. We observe
that SC violations can be detected in hardware through
a lightweight form of conflict detection. Furthermore,
our model safely allows aggressive compiler and
hardware optimizations within compiler-designated
program regions. We formalize our memory model, prove
several properties of this model, describe a compiler
and hardware design suitable for drf x, and evaluate
the performance overhead due to our compiler and
hardware requirements.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Transactions on Programming Languages and
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Martin Marinov and Nicholas Nash and David Gregg",
title = "Practical Algorithms for Finding Extremal Sets",
journal = j-ACM-J-EXP-ALGORITHMICS,
volume = "21",
number = "1",
pages = "1.9:1--1.9:??",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2893184",
ISSN = "1084-6654",
bibdate = "Fri Nov 4 16:46:55 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jea.bib",
abstract = "The minimal sets within a collection of sets are
defined as the ones that do not have a proper subset
within the collection, and the maximal sets are the
ones that do not have a proper superset within the
collection. Identifying extremal sets is a fundamental
problem with a wide range of applications in SAT
solvers, data mining, and social network analysis. In
this article, we present two novel improvements of the
high-quality extremal set identification algorithm,
AMS-Lex, described by Bayardo and Panda. The first
technique uses memoization to improve the execution
time of the single-threaded variant of the AMS-Lex,
while our second improvement uses parallel programming
methods. In a subset of the presented experiments, our
memoized algorithm executes more than 400 times faster
than the highly efficient publicly available
implementation of AMS-Lex. Moreover, we show that our
modified algorithm's speedup is not bounded above by a
constant and that it increases as the length of the
common prefixes in successive input itemsets increases.
We provide experimental results using both real-world
and synthetic datasets, and show our multithreaded
variant algorithm outperforming AMS-Lex by 3 to 6
times. We find that on synthetic input datasets, when
executed using 16 CPU cores of a 32-core machine, our
multithreaded program executes about as fast as the
state-of-the-art parallel GPU-based program using an
NVIDIA GTX 580 graphics processing unit.",
acknowledgement = ack-nhfb,
articleno = "1.9",
fjournal = "Journal of Experimental Algorithmics (JEA)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J430",
author = "Kenneth Moreland and Christopher Sewell and William
Usher and Li-ta Lo and Jeremy Meredith and David
Pugmire and James Kress and Hendrik Schroots and
Kwan-Liu Ma and Hank Childs and Matthew Larsen and
Chun-Ming Chen and Robert Maynard and Berk Geveci",
title = "{VTK-m}: Accelerating the Visualization Toolkit for
Massively Threaded Architectures",
journal = j-IEEE-CGA,
volume = "36",
number = "3",
pages = "48--58",
month = may # "\slash " # jun,
year = "2016",
ISSN = "0272-1716 (print), 1558-1756 (electronic)",
ISSN-L = "0272-1716",
bibdate = "Wed Oct 5 07:24:20 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecga.bib",
URL = "https://www.computer.org/csdl/mags/cg/2016/03/mcg2016030048-abs.html",
acknowledgement = ack-nhfb,
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=38",
author = "Ganesh Narayanaswamy and Saurabh Joshi and Daniel
Kroening",
title = "The virtues of conflict: analysing modern
concurrency",
journal = j-SIGPLAN,
volume = "51",
number = "8",
pages = "25:1--25:??",
month = aug,
year = "2016",
DOI = "https://doi.org/10.1145/3016078.2851165",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sat Sep 16 10:18:12 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Modern shared memory multiprocessors permit reordering
of memory operations for performance reasons. These
reorderings are often a source of subtle bugs in
programs written for such architectures. Traditional
approaches to verify weak memory programs often rely on
interleaving semantics, which is prone to state space
explosion, and thus severely limits the scalability of
the analysis. In recent times, there has been a renewed
interest in modelling dynamic executions of weak memory
programs using partial orders. However, such an
approach typically requires ad-hoc mechanisms to
correctly capture the data and control-flow
choices/conflicts present in real-world programs. In
this work, we propose a novel, conflict-aware,
composable, truly concurrent semantics for programs
written using C/C++ for modern weak memory
architectures. We exploit our symbolic semantics based
on general event structures to build an efficient
decision procedure that detects assertion violations in
bounded multi-threaded programs. Using a large,
representative set of benchmarks, we show that our
conflict-aware semantics outperforms the
state-of-the-art partial-order based approaches.",
acknowledgement = ack-nhfb,
articleno = "25",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '16 conference proceedings.",
author = "David Nogueira and Pedro Tomas and Nuno Roma",
title = "{BowMapCL}: {Burrows--Wheeler} Mapping on Multiple
Heterogeneous Accelerators",
journal = j-TCBB,
volume = "13",
number = "5",
pages = "926--938",
month = sep,
year = "2016",
DOI = "https://doi.org/10.1109/TCBB.2015.2495149",
ISSN = "1545-5963 (print), 1557-9964 (electronic)",
ISSN-L = "1545-5963",
bibdate = "Fri Dec 30 16:19:30 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The computational demand of exact-search procedures
has pressed the exploitation of parallel processing
accelerators to reduce the execution time of many
applications. However, this often imposes strict
restrictions in terms of the problem size and
implementation efforts, mainly due to their possibly
distinct architectures. To circumvent this limitation,
a new exact-search alignment tool BowMapCL based on the
Burrows--Wheeler Transform and FM-Index is presented.
Contrasting to other alternatives, BowMapCL is based on
a unified implementation using OpenCL, allowing the
exploitation of multiple and possibly different devices
e.g., NVIDIA, AMD/ATI, and Intel GPUs/APUs.
Furthermore, to efficiently exploit such heterogeneous
architectures, BowMapCL incorporates several techniques
to promote its performance and scalability, including
multiple buffering, work-queue task-distribution, and
dynamic load-balancing, together with index
partitioning, bit-encoding, and sampling. When compared
with state-of-the-art tools, the attained results
showed that BowMapCL using a single GPU is $ 2 \times $
to $ 7.5 \times $ faster than mainstream multi-threaded
CPU BWT-based aligners, like Bowtie, BWA, and SOAP2;
and up to $ 4 \times $ faster than the best performing
state-of-the-art GPU implementations namely, SOAP3 and
HPG-BWT. When multiple and completely distinct devices
are considered, BowMapCL efficiently scales the offered
throughput, ensuring a convenient load-balance of the
involved processing in the several distinct devices.",
acknowledgement = ack-nhfb,
fjournal = "IEEE/ACM Transactions on Computational Biology and
Bioinformatics",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J954",
author = "Brian Norris and Brian Demsky",
title = "A Practical Approach for Model Checking {C\slash
C++11} Code",
journal = j-TOPLAS,
volume = "38",
number = "3",
pages = "10:1--10:??",
month = may,
year = "2016",
DOI = "https://doi.org/10.1145/2806886",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Mon May 2 16:24:58 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/",
abstract = "Writing low-level concurrent software has
traditionally required intimate knowledge of the entire
toolchain and often has involved coding in assembly.
New language standards have extended C and C++ with
support for low-level atomic operations and a weak
memory model, enabling developers to write portable and
efficient multithreaded code. In this article, we
present CDSChecker, a tool for exhaustively exploring
the behaviors of concurrent code under the C/C++ memory
model. We have used CDSChecker to exhaustively unit
test concurrent data structure implementations and have
discovered errors in a published implementation of a
work-stealing queue and a single producer, single
consumer queue.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Programming Languages and
Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Stavros Papadopoulos and Kushal Datta and Samuel
Madden and Timothy Mattson",
title = "The {TileDB} array data storage manager",
journal = "Proceedings of the VLDB Endowment",
volume = "10",
number = "4",
pages = "349--360",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025117",
ISSN = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We present a novel storage manager for
multi-dimensional arrays that arise in scientific
applications, which is part of a larger scientific data
management system called TileDB. In contrast to
existing solutions, TileDB is optimized for both dense
and sparse arrays. Its key idea is to organize array
elements into ordered collections called fragments.
Each fragment is dense or sparse, and groups contiguous
array elements into data tiles of fixed capacity. The
organization into fragments turns random writes into
sequential writes, and, coupled with a novel read
algorithm, leads to very efficient reads. TileDB
enables parallelization via multi-threading and
multi-processing, offering thread-/process-safety and
atomicity via lightweight locking. We show that TileDB
delivers comparable performance to the HDF5 dense array
storage manager, while providing much faster random
writes. We also show that TileDB offers substantially
faster reads and writes than the SciDB array database
system with both dense and sparse arrays. Finally, we
demonstrate that TileDB is considerably faster than
adaptations of the Vertica relational column-store for
dense array storage management, and at least as fast
for the case of sparse arrays.",
acknowledgement = ack-nhfb,
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
author = "Hyukwoo Park and Myungsu Cha and Soo-Mook Moon",
title = "Concurrent {JavaScript} Parsing for Faster Loading of
{Web} Apps",
journal = j-TACO,
volume = "13",
number = "4",
pages = "41:1--41:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3004281",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "JavaScript is a dynamic language mainly used as a
client-side web script. Nowadays, web is evolving into
an application platform with its web apps, and
JavaScript increasingly undertakes complex computations
and interactive user interfaces, requiring a
high-performance JavaScript engine. There have been
many optimizations for efficient JavaScript engines,
but one component that has not been optimized much is
JavaScript parsing. A JavaScript function needs to be
parsed before being executed, and the parsing overhead
takes a substantial portion of JavaScript execution
time for web apps, especially during app loading. This
article proposes concurrent parsing of JavaScript,
which performs the parsing of JavaScript functions in
advance on different threads, while the main thread is
executing the parsed JavaScript functions. This can
hide the parsing overhead from the main execution
thread, reducing the JavaScript execution time, thus
reducing the overall app loading time. More
specifically, we separated JavaScript parsing and made
it run on different threads without violating the
execution semantics of JavaScript. We also designed an
efficient multi-threaded parsing architecture, which
reduces the synchronization overhead and schedules the
parsing requests appropriately. Finally, we explored
two methods of choosing the target functions for
concurrent parsing: one based on profiled information
and the other based on speculative heuristics. We
performed experiments on the WebKit browser with the
JSC engine for real web apps. The result shows that the
proposed concurrent parsing can improve the JavaScript
performance during app loading by as much as 64\% and
by 39.7\% on average. This improves the whole app
loading performance tangibly, by as much as 32.7\% and
by 18.2\%, on average.",
acknowledgement = ack-nhfb,
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N.
Bhuyan",
title = "{Tumbler}: an Effective Load-Balancing Technique for
Multi-{CPU} Multicore Systems",
journal = j-TACO,
volume = "12",
number = "4",
pages = "36:1--36:??",
month = jan,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2827698",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Feb 16 15:36:38 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Schedulers used by modern OSs (e.g., Oracle Solaris
11{\trademark} and GNU/Linux) balance load by balancing
the number of threads in run queues of different cores.
While this approach is effective for a single CPU
multicore system, we show that it can lead to a
significant load imbalance across CPUs of a multi-CPU
multicore system. Because different threads of a
multithreaded application often exhibit different
levels of CPU utilization, load cannot be measured in
terms of the number of threads alone. We propose
Tumbler that migrates the threads of a multithreaded
program across multiple CPUs to balance the load across
the CPUs. While Tumbler distributes the threads equally
across the CPUs, its assignment of threads to CPUs is
aimed at minimizing the variation in utilization of
different CPUs to achieve load balance. We evaluated
Tumbler using a wide variety of 35 multithreaded
applications, and our experimental results show that
Tumbler outperforms both Oracle Solaris 11{\trademark}
and GNU/Linux.",
acknowledgement = ack-nhfb,
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Junjie Qian and Witawas Srisa-an and Sharad Seth and
Hong Jiang and Du Li and Pan Yi",
title = "Exploiting {FIFO} Scheduler to Improve Parallel
Garbage Collection Performance",
journal = j-SIGPLAN,
volume = "51",
number = "7",
pages = "109--121",
month = jul,
year = "2016",
DOI = "https://doi.org/10.1145/3007611.2892248",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sat Sep 16 10:18:12 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Recent studies have found that parallel garbage
collection performs worse with more CPUs and more
collector threads. As part of this work, we further
investigate this phenomenon and find that poor
scalability is worst in highly scalable Java
applications. Our investigation to find the causes
clearly reveals that efficient multi-threading in an
application can prolong the average object lifespan,
which results in less effective garbage collection. We
also find that prolonging lifespan is the direct result
of Linux's Completely Fair Scheduler due to its
round-robin like behavior that can increase the heap
contention between the application threads. Instead, if
we use pseudo first-in-first-out to schedule
application threads in large multicore systems, the
garbage collection scalability is significantly
improved while the time spent in garbage collection is
reduced by as much as 21\%. The average execution time
of the 24 Java applications used in our study is also
reduced by 11\%. Based on this observation, we propose
two approaches to optimally select scheduling policies
based on application scalability profile. Our first
approach uses the profile information from one
execution to tune the subsequent executions. Our second
approach dynamically collects profile information and
performs policy selection during execution.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "VEE '16 conference proceedings.",
author = "Xuehai Qian and Koushik Sen and Paul Hargrove and
Costin Iancu",
title = "{OPR}: deterministic group replay for one-sided
communication",
journal = j-SIGPLAN,
volume = "51",
number = "8",
pages = "47:1--47:??",
month = aug,
year = "2016",
DOI = "https://doi.org/10.1145/3016078.2851179",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sat Sep 16 10:18:12 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The ability to reproduce a parallel execution is
desirable for debugging and program reliability
purposes. In debugging (13), the programmer needs to
manually step back in time, while for resilience (6)
this is automatically performed by the application upon
failure. To be useful, replay has to faithfully
reproduce the original execution. For parallel programs
the main challenge is inferring and maintaining the
order of conflicting operations (data races).
Deterministic record and replay (R{\&}R) techniques
have been developed for multithreaded shared memory
programs (5), as well as distributed memory programs
(14). Our main interest is techniques for large scale
scientific (3; 4) programming models.",
acknowledgement = ack-nhfb,
articleno = "47",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '16 conference proceedings.",
author = "P. Radojkovic and P. M. Carpenter and M. Moreto and V.
Cakarevic and J. Verdu and A. Pajuelo and F. J. Cazorla
and M. Nemirovsky and M. Valero",
title = "Thread Assignment in Multicore\slash Multithreaded
Processors: A Statistical Approach",
journal = j-IEEE-TRANS-COMPUT,
volume = "65",
number = "1",
pages = "256--269",
month = jan,
year = "2016",
DOI = "https://doi.org/10.1109/TC.2015.2417533",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Tue Dec 15 09:36:24 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Malavika Samak and Omer Tripp and Murali Krishna
Ramanathan",
title = "Directed synthesis of failing concurrent executions",
journal = j-SIGPLAN,
volume = "51",
number = "10",
pages = "430--446",
month = oct,
year = "2016",
DOI = "https://doi.org/10.1145/3022671.2984040",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Sat Sep 16 10:18:13 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib",
abstract = "Detecting concurrency-induced bugs in multithreaded
libraries can be challenging due to the intricacies
associated with their manifestation. This includes
invocation of multiple methods, synthesis of inputs to
the methods to reach the failing location, and crafting
of thread interleavings that cause the erroneous
behavior. Neither fuzzing-based testing techniques nor
over-approximate static analyses are well positioned to
detect such subtle defects while retaining high
accuracy alongside satisfactory coverage. In this
paper, we propose a directed, iterative and scalable
testing engine that combines the strengths of static
and dynamic analysis to help synthesize concurrent
executions to expose complex concurrency-induced bugs.
Our engine accepts as input the library, its client
(either sequential or concurrent) and a specification
of correctness. Then, it iteratively refines the client
to generate an execution that can break the input
specification. Each step of the iterative process
includes statically identifying sub-goals towards the
goal of failing the specification, generating a plan
toward meeting these goals, and merging of the paths
traversed dynamically with the plan computed statically
via constraint solving to generate a new client. The
engine reports full reproduction scenarios, guaranteed
to be true, for the bugs it finds. We have created a
prototype of our approach named MINION. We validated
MINION by applying it to well-tested concurrent classes
from popular Java libraries, including the latest
versions of OpenJDK and Google-Guava. We were able to
detect 31 real crashes across 10 classes in a total of
23 minutes, including previously unknown bugs.
Comparison with three other tools reveals that
combined, they report only 9 of the 31 crashes (and no
other crashes beyond MINION). This is because several
of these bugs manifest under deeply nested path
conditions (observed maximum of 11), deep nesting of
method invocations (observed maximum of 6) and multiple
refinement iterations to generate the crash-inducing input.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "OOPSLA '16 conference proceedings.",
author = "Faissal M. Sleiman and Thomas F. Wenisch",
title = "Efficiently scaling out-of-order cores for
simultaneous multithreading",
journal = j-COMP-ARCH-NEWS,
volume = "44",
number = "3",
pages = "431--443",
month = jun,
year = "2016",
DOI = "https://doi.org/10.1145/3007787.3001183",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Thu Jan 12 18:43:43 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Simultaneous multithreading (SMT) out-of-order cores
waste a significant portion of structural out-of-order
core resources on instructions that do not need them.
These resources eliminate false ordering dependences.
However, because thread interleaving spreads dependent
instructions, nearly half of instructions dynamically
issue in program order after all false dependences have
resolved. These in-sequence instructions interleave
with other reordered instructions at a fine granularity
within the instruction window. We develop a technique
to efficiently scale in-flight instructions through a
hybrid out-of-order/in-order microarchitecture, which
can dispatch instructions to efficient in-order
scheduling mechanisms---using a FIFO issue queue called
the shelf ---on an instruction-by-instruction basis.
Instructions dispatched to the shelf do not allocate
out-of-order core resources in the reorder buffer,
issue queue, physical registers, or load-store queues.
We measure opportunity for such hybrid
microarchitectures and design and evaluate a practical
dispatch mechanism targeted at 4-threaded cores. Adding
a shelf to a baseline 4-thread system with 64-entry ROB
improves normalized system throughput by 11.5\% (up to
19.2\% at best) and energy-delay product by 10.9\% (up
to 17.5\% at best).",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
remark = "ISCA '16 conference proceedings.",
author = "Zhenzhou Tian and Ting Liu and Qinghua Zheng and Ming
Fan and Eryue Zhuang and Zijiang Yang",
title = "Exploiting thread-related system calls for plagiarism
detection of multithreaded programs",
journal = j-J-SYST-SOFTW,
volume = "119",
number = "??",
pages = "136--148",
month = sep,
year = "2016",
ISSN = "0164-1212 (print), 1873-1228 (electronic)",
ISSN-L = "0164-1212",
bibdate = "Sat Jul 16 18:10:04 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jsystsoftw.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0164121216300838",
acknowledgement = ack-nhfb,
fjournal = "Journal of Systems and Software",
journal-URL = "http://www.sciencedirect.com/science/journal/01641212/",
author = "Tiago M. Vale and Jo{\~a}o A. Silva and Ricardo J.
Dias and Jo{\~a}o M. Louren{\c{c}}o",
title = "{Pot}: Deterministic Transactional Execution",
journal = j-TACO,
volume = "13",
number = "4",
pages = "52:1--52:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3017993",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Dec 28 16:24:46 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "This article presents Pot, a system that leverages the
concept of preordered transactions to achieve
deterministic multithreaded execution of programs that
use Transactional Memory. Preordered transactions
eliminate the root cause of nondeterminism in
transactional execution: they provide the illusion of
executing in a deterministic serial order, unlike
traditional transactions that appear to execute in a
nondeterministic order that can change from execution
to execution. Pot uses a new concurrency control
protocol that exploits the serialization order to
distinguish between fast and speculative transaction
execution modes in order to mitigate the overhead of
imposing a deterministic order. We build two Pot
prototypes: one using STM and another using
off-the-shelf HTM. To the best of our knowledge, Pot
enables deterministic execution of programs using
off-the-shelf HTM for the first time. An experimental
evaluation shows that Pot achieves deterministic
execution of TM programs with low overhead, sometimes
even outperforming nondeterministic executions, and
clearly outperforming the state of the art.",
acknowledgement = ack-nhfb,
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Field G. {Van Zee} and Tyler M. Smith and Bryan Marker
and Tze Meng Low and Robert A. {Van De Geijn} and
Francisco D. Igual and Mikhail Smelyanskiy and Xianyi
Zhang and Michael Kistler and Vernon Austel and John A.
Gunnels and Lee Killough",
title = "The {BLIS} Framework: Experiments in Portability",
journal = j-TOMS,
volume = "42",
number = "2",
pages = "12:1--12:19",
month = jun,
year = "2016",
DOI = "https://doi.org/10.1145/2755561",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Fri Jun 3 18:52:21 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "BLIS is a new software framework for instantiating
high-performance BLAS-like dense linear algebra
libraries. We demonstrate how BLIS acts as a
productivity multiplier by using it to implement the
level-3 BLAS on a variety of current architectures. The
systems for which we demonstrate the framework include
state-of-the-art general-purpose, low-power, and
many-core architectures. We show, with very little
effort, how the BLIS framework yields sequential and
parallel implementations that are competitive with the
performance of ATLAS, OpenBLAS (an effort to maintain
and extend the GotoBLAS), and commercial vendor
implementations such as AMD's ACML, IBM's ESSL, and
Intel's MKL libraries. Although most of this article
focuses on single-core implementation, we also provide
compelling results that suggest the framework's
leverage extends to the multithreaded domain.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
author = "Javier Verdu and Alex Pajuelo",
title = "Performance Scalability Analysis of {JavaScript}
Applications with {Web Workers}",
volume = "15",
number = "2",
pages = "105--108",
month = jul # "\slash " # dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2494585",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Web applications are getting closer to the performance
of native applications taking advantage of new
standard-based technologies. The recent HTML5 standard
includes, among others, the Web Workers API that allows
executing JavaScript applications on multiple threads,
or workers. However, the internals of the browser's
JavaScript virtual machine does not expose direct
relation between workers and running threads in the
browser and the utilization of logical cores in the
processor. As a result, developers do not know how
performance actually scales on different environments
and therefore what is the optimal number of workers on
parallel JavaScript codes. This paper presents the
first performance scalability analysis of parallel web
apps with multiple workers. We focus on two case
studies representative of different worker execution
models. Our analyses show performance scaling on
different parallel processor microarchitectures and on
three major web browsers in the market. Besides, we
study the impact of co-running applications on the web
app performance. The results provide insights for
future approaches to automatically find out the optimal
number of workers that provide the best tradeoff
between performance and resource usage to preserve
system responsiveness and user experience, especially
on environments with unexpected changes on system workload.",
acknowledgement = ack-nhfb,
affiliation = "Verdu, J (Reprint Author), BarcelonaTECH UPC, Dept
Comp Architecture, Barcelona, Spain. Verdu, Javier;
Pajuelo, Alex, BarcelonaTECH UPC, Dept Comp
Architecture, Barcelona, Spain.",
author-email = "jverdu@ac.upc.edu mpajuelo@ac.upc.edu",
da = "2019-06-20",
doc-delivery-number = "EH9MM",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Spanish Ministry of Economy and
Competitiveness (MINECO) [TIN2012-34557]",
funding-text = "This work has been supported by the Spanish Ministry
of Economy and Competitiveness (MINECO) under contract TIN2012-34557.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "HTML5; javascript; multithreading; parallelism; web
apps; web workers",
number-of-cited-references = "12",
oa = "Green Published",
ORCID-numbers = "Pajuelo, Alex/0000-0002-5510-6860 Verdu Mula,
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Verdu:2016:PSA",
web-of-science-categories = "Computer Science, Hardware \& Architecture",
author = "Yuan Yao and Zhonghai Lu",
title = "Opportunistic competition overhead reduction for
expediting critical section in {NoC} based {CMPs}",
journal = j-COMP-ARCH-NEWS,
volume = "44",
number = "3",
pages = "279--290",
month = jun,
year = "2016",
DOI = "https://doi.org/10.1145/3007787.3001167",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Thu Jan 12 18:43:43 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "With the degree of parallelism increasing, performance
of multi-threaded shared variable applications is not
only limited by serialized critical section execution,
but also by the serialized competition overhead for
threads to get access to critical section. As the
number of concurrent threads grows, such competition
overhead may exceed the time spent in critical section
itself, and become the dominating factor limiting the
performance of parallel applications. In modern
operating systems, queue spinlock, which comprises a
low-overhead spinning phase and a high-overhead
sleeping phase, is often used to lock critical
sections. In the paper, we show that this advanced
locking solution may create very high competition
overhead for multithreaded applications executing in
NoC-based CMPs. Then we propose a software-hardware
cooperative mechanism that can opportunistically
maximize the chance that a thread wins the critical
section access in the low-overhead spinning phase,
thereby reducing the competition overhead. At the OS
primitives level, we monitor the remaining times of
retry (RTR) in a thread's spinning phase, which
reflects in how long the thread must enter into the
high-overhead sleep mode. At the hardware level, we
integrate the RTR information into the packets of
locking requests, and let the NoC prioritize locking
request packets according to the RTR information. The
principle is that the smaller RTR a locking request
packet carries, the higher priority it gets and thus
quicker delivery. We evaluate our opportunistic
competition overhead reduction technique with
cycle-accurate full-system simulations in GEM5 using
PARSEC (11 programs) and SPEC OMP2012 (14 programs)
benchmarks. Compared to the original queue spinlock
implementation, experimental results show that our
method can effectively increase the opportunity of
threads entering the critical section in low-overhead
spinning phase, reducing the competition overhead
averagely by 39.9\% (maximally by 61.8\%) and
accelerating the execution of the Region-of-Interest
averagely by 14.4\% (maximally by 24.5\%) across all 25
benchmark programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
remark = "ISCA '16 conference proceedings.",
author = "Paraskevas Yiapanis and Gavin Brown and Mikel Luj{\'a}n",
title = "Compiler-Driven Software Speculation for Thread-Level Parallelism",
journal = j-TOPLAS,
volume = "38",
number = "2",
pages = "5:1--5:??",
month = jan,
year = "2016",
DOI = "https://doi.org/10.1145/2821505",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Tue Jan 5 16:31:06 MST 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/toplas/;
abstract = "Current parallelizing compilers can tackle
applications exercising regular access patterns on
arrays or affine indices, where data dependencies can
be expressed in a linear form. Unfortunately, there are
cases that independence between statements of code
cannot be guaranteed and thus the compiler
conservatively produces sequential code. Programs that
involve extensive pointer use, irregular access
patterns, and loops with unknown number of iterations
are examples of such cases. This limits the extraction
of parallelism in cases where dependencies are rarely
or never triggered at runtime. Speculative parallelism
refers to methods employed during program execution
that aim to produce a valid parallel execution schedule
for programs immune to static parallelization. The
motivation for this article is to review recent
developments in the area of compiler-driven software
speculation for thread-level parallelism and how they
came about. The article is divided into two parts. In
the first part the fundamentals of speculative
parallelization for thread-level parallelism are
explained along with a design choice categorization for
implementing such systems. Design choices include the
ways speculative data is handled, how data dependence
violations are detected and resolved, how the correct
data are made visible to other threads, or how
speculative threads are scheduled. The second part is
structured around those design choices providing the
advances and trends in the literature with reference to
key developments in the area. Although the focus of the
article is in software speculative parallelization, a
section is dedicated for providing the interested
reader with pointers and references for exploring
similar topics such as hardware thread-level
speculation, transactional memory, and automatic parallelization.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Programming Languages and Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Hairong Yu and Guohui Li and Jianjun Li and Lihchyun Shu",
title = "{DO$_{\rm cyclical}$}: a Latency-Resistant Cyclic
Multi-Threading Approach for Automatic Program Parallelization",
journal = j-COMP-J,
volume = "59",
number = "8",
pages = "1155--1173",
month = aug,
year = "2016",
DOI = "https://doi.org/10.1093/comjnl/bxv125",
ISSN = "0010-4620 (print), 1460-2067 (electronic)",
ISSN-L = "0010-4620",
bibdate = "Tue Aug 30 07:10:50 MDT 2016",
bibsource = "http://comjnl.oxfordjournals.org/content/59/8.toc;
URL = "http://comjnl.oxfordjournals.org/content/59/8/1155",
acknowledgement = ack-nhfb,
fjournal = "Computer Journal",
journal-URL = "http://comjnl.oxfordjournals.org/",
onlinedate = "January 14, 2016",
author = "Mingzhe Zhang and Francis C. M. Lau and Cho-Li Wang
and Luwei Cheng and Haibo Chen",
title = "Scalable adaptive {NUMA}-aware lock: combining local
locking and remote locking for efficient concurrency",
journal = j-SIGPLAN,
volume = "51",
number = "8",
pages = "50:1--50:??",
month = aug,
year = "2016",
DOI = "https://doi.org/10.1145/3016078.2851176",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Sat Sep 16 10:18:12 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Scalable locking is a key building block for scalable
multi-threaded software. Its performance is especially
critical in multi-socket, multi-core machines with
non-uniform memory access (NUMA). Previous schemes such
as local locking and remote locking only perform well
under a certain level of contention, and often require
non-trivial tuning for a particular configuration.
Besides, for large NUMA systems, because of unmanaged
lock server's nomination, current distance-first NUMA
policies cannot perform satisfactorily. In this work,
we propose SANL, a locking scheme that can deliver high
performance under various contention levels by
adaptively switching between the local and the remote
lock scheme. Furthermore, we introduce a new NUMA
policy for the remote lock that jointly considers node
distances and server utilization when choosing lock
servers. A comparison with seven representative locking
schemes shows that SANL outperforms the others in most
contention situations. In one group test, SANL is 3.7
times faster than RCL lock and 17 times faster than
POSIX mutex.",
acknowledgement = ack-nhfb,
articleno = "50",
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '16 conference proceedings.",
author = "Tong Zhang and Dongyoon Lee and Changhee Jung",
title = "{TxRace}: Efficient Data Race Detection Using
Commodity Hardware Transactional Memory",
journal = j-OPER-SYS-REV,
volume = "50",
number = "2",
pages = "159--173",
month = jun,
year = "2016",
DOI = "https://doi.org/10.1145/2954680.2872384",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Thu Jun 9 17:03:34 MDT 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Detecting data races is important for debugging
shared-memory multithreaded programs, but the high
runtime overhead prevents the wide use of dynamic data
race detectors. This paper presents TxRace, a new
software data race detector that leverages commodity
hardware transactional memory (HTM) to speed up data
race detection. TxRace instruments a multithreaded
program to transform synchronization-free regions into
transactions, and exploits the conflict detection
mechanism of HTM for lightweight data race detection at
runtime. However, the limitations of the current
best-effort commodity HTMs expose several challenges in
using them for data race detection: (1) lack of ability
to pinpoint racy instructions, (2) false positives
caused by cache line granularity of conflict detection,
and (3) transactional aborts for non-conflict reasons
(e.g., capacity or unknown). To overcome these
challenges, TxRace performs lightweight HTM-based data
race detection at first, and occasionally switches to
slow yet precise data race detection only for the small
fraction of execution intervals in which potential
races are reported by HTM. According to the
experimental results, TxRace reduces the average
runtime overhead of dynamic data race detection from
11.68x to 4.65x with only a small number of false negatives.",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J597",
author = "Miguel Areias and Ricardo Rocha",
title = "On scaling dynamic programming problems with a
multithreaded tabling {Prolog} system",
journal = j-J-SYST-SOFTW,
volume = "125",
number = "??",
pages = "417--426",
month = mar,
year = "2017",
ISSN = "0164-1212 (print), 1873-1228 (electronic)",
ISSN-L = "0164-1212",
bibdate = "Sat Feb 4 12:20:39 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jsystsoftw.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0164121216300929",
acknowledgement = ack-nhfb,
fjournal = "Journal of Systems and Software",
journal-URL = "http://www.sciencedirect.com/science/journal/01641212/",
author = "Jaime Arteaga and St{\'e}phane Zuckerman and Guang R. Gao",
title = "Generating Fine-Grain Multithreaded Applications Using
a Multigrain Approach",
journal = j-TACO,
volume = "14",
number = "4",
pages = "47:1--47:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3155288",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "The recent evolution in hardware landscape, aimed at
producing high-performance computing systems capable of
reaching extreme-scale performance, has reignited the
interest in fine-grain multithreading, particularly at
the intranode level. Indeed, popular parallel
programming environments, such as OpenMP, which
features a simple interface for the parallelization of
programs, are now incorporating fine-grain constructs.
However, since coarse-grain directives are still
heavily used, the OpenMP runtime is forced to support
both coarse- and fine-grain models of execution,
potentially reducing the advantages obtained when
executing an application in a fully fine-grain
environment. To evaluate the type of applications that
benefit from executing in a unified fine-grain program
execution model, this article presents a multigrain
parallel programming environment for the generation of
fine-grain multithreaded applications from programs
featuring OpenMP's API, allowing OpenMP programs to be
run on top of a fine-grain event-driven program
execution model. Experimental results with five
scientific benchmarks show that fine-grain
applications, generated by and run on our environment
with two runtimes implementing a fine-grain
event-driven program execution model, are competitive
and can outperform their OpenMP counterparts,
especially for data-intensive workloads with irregular
and dynamic parallelism, reaching speedups as high as
2.6$ \times $ for Graph500 and 51$ \times $ for NAS
Data Cube.",
acknowledgement = ack-nhfb,
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Michael A. Bender and Jonathan W. Berry and Simon D.
Hammond and K. Scott Hemmert and Samuel McCauley and
Branden Moore and Benjamin Moseley and Cynthia A.
Phillips and David Resnick and Arun Rodrigues",
title = "Two-level main memory co-design: Multi-threaded
algorithmic primitives, analysis, and simulation",
journal = j-J-PAR-DIST-COMP,
volume = "102",
number = "??",
pages = "213--228",
month = apr,
year = "2017",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Wed Jan 25 14:20:18 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S074373151630185X",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315/",
author = "Jim Blandy and Jason Orendorff",
title = "Programming {Rust}",
publisher = pub-ORA-MEDIA,
address = pub-ORA-MEDIA:adr,
pages = "xx + 598",
year = "2017",
ISBN = "1-4919-2728-3 (paperback), 1-4919-2727-5,
1-4919-2723-2 (e-book), 1-4919-2725-9 (e-book)",
ISBN-13 = "978-1-4919-2728-1 (paperback), 978-1-4919-2727-4,
978-1-4919-2723-6 (e-book), 978-1-4919-2725-0 (e-book)",
LCCN = "QA76.73.R88 B53 2017",
bibdate = "Mon Dec 9 15:37:10 MST 2019",
bibsource = "fsz3950.oclc.org:210/WorldCat;
URL = "http://proquest.safaribooksonline.com/9781491927274",
abstract = "Rust is a new systems programming language that
combines the performance and low-level control of C and
C++ with memory safety and thread safety. Rust's
modern, flexible types ensure your program is free of
null pointer dereferences, double frees, dangling
pointers, and similar bugs, all at compile time,
without runtime overhead. In multithreaded code, Rust
catches data races at compile time, making concurrency
much easier to use. Written by two experienced systems
programmers, this book explains how Rust manages to
bridge the gap between performance and safety, and how
you can take advantage of it. Topics include: How Rust
represents values in memory (with diagrams) Complete
explanations of ownership, moves, borrows, and
lifetimes Cargo, rustdoc, unit tests, and how to
publish your code on crates.io, Rust's public package
repository High-level features like generic code,
closures, collections, and iterators that make Rust
productive and flexible Concurrency in Rust: threads,
mutexes, channels, and atomics, all much safer to use
than in C or C++ Unsafe code, and how to preserve the
integrity of ordinary code that uses it. Extended
examples illustrating how pieces of the language fit together.",
acknowledgement = ack-nhfb,
libnote = "Not in my library.",
subject = "UNIX (Computer file); UNIX (Computer file); C
(Computer program language); Text editors (Computer
programs); Software engineering; C (Computer program
language); Software engineering.; Text editors
(Computer programs)",
tableofcontents = "Preface \\
Who Should Read This Book \\
Why We Wrote This Book \\
Navigating This Book \\
Conventions Used in This Book \\
Using Code Examples \\
O'Reilly Safari \\
How to Contact Us \\
Acknowledgments \\
1. Why Rust? \\
Type Safety \\
2. A Tour of Rust \\
Downloading and Installing Rust \\
A Simple Function \\
Writing and Running Unit Tests \\
Handling Command-Line Arguments \\
A Simple Web Server \\
Concurrency \\
What the Mandelbrot Set Actually Is \\
Parsing Pair Command-Line Arguments \\
Mapping from Pixels to Complex Numbers \\
Plotting the Set \\
Writing Image Files \\
A Concurrent Mandelbrot Program \\
Running the Mandelbrot Plotter \\
Safety Is Invisible \\
3. Basic Types \\
Machine Types \\
Integer Types \\
Floating-Point Types \\
The bool Type \\
Characters \\
Tuples \\
Pointer Types \\
References \\
Boxes \\
Raw Pointers \\
Arrays, Vectors, and Slices \\
Arrays \\
Vectors \\
Slices \\
String Types \\
String Literals \\
Byte Strings \\
Strings in Memory \\
String \\
Using Strings \\
Other String-Like Types \\
Beyond the Basics \\
4. Ownership \\
Ownership \\
Moves \\
More Operations That Move \\
Moves and Control Flow \\
Moves and Indexed Content \\
Copy Types: The Exception to Moves \\
Rc and Arc: Shared Ownership \\
5. References \\
References as Values \\
Rust References Versus C++ References \\
Assigning References \\
References to References \\
Comparing References \\
References Are Never Null \\
Borrowing References to Arbitrary Expressions \\
References to Slices and Trait Objects \\
Reference Safety \\
Borrowing a Local Variable \\
Receiving References as Parameters \\
Passing References as Arguments \\
Returning References \\
Structs Containing References \\
Distinct Lifetime Parameters \\
Omitting Lifetime Parameters \\
Sharing Versus Mutation \\
Taking Arms Against a Sea of Objects \\
6. Expressions \\
An Expression Language \\
Blocks and Semicolons \\
Declarations \\
if and match \\
if let \\
Loops \\
return Expressions \\
Why Rust Has loop \\
Function and Method Calls \\
Fields and Elements \\
Reference Operators \\
Arithmetic, Bitwise, Comparison, and Logical Operators \\
Assignment \\
Type Casts \\
Closures \\
Precedence and Associativity \\
Onward \\
7. Error Handling \\
Panic \\
Unwinding \\
Aborting \\
Result \\
Catching Errors \\
Result Type Aliases \\
Printing Errors \\
Propagating Errors \\
Working with Multiple Error Types \\
Dealing with Errors That Can't Happen \\
Ignoring Errors \\
Handling Errors in main() \\
Declaring a Custom Error Type \\
Why Results? \\
8. Crates and Modules \\
Crates \\
Build Profiles \\
Modules \\
Modules in Separate Files \\
Paths and Imports \\
The Standard Prelude \\
Items, the Building Blocks of Rust \\
Turning a Program into a Library \\
The src/bin Directory \\
Attributes \\
Tests and Documentation \\
Integration Tests \\
Documentation \\
Doc-Tests \\
Specifying Dependencies \\
Versions \\
Cargo.lock \\
Publishing Crates to crates.io \\
Workspaces \\
More Nice Things \\
9. Structs \\
Named-Field Structs \\
Tuple-Like Structs \\
Unit-Like Structs \\
Struct Layout \\
Defining Methods with impl \\
Generic Structs \\
Structs with Lifetime Parameters \\
Deriving Common Traits for Struct Types \\
Interior Mutability \\
10. Enums and Patterns \\
Enums \\
Enums with Data \\
Enums in Memory \\
Rich Data Structures Using Enums \\
Generic Enums \\
Patterns \\
Literals, Variables, and Wildcards in Patterns \\
Tuple and Struct Patterns \\
Reference Patterns \\
Matching Multiple Possibilities \\
Pattern Guards \\
@ patterns \\
Where Patterns Are Allowed \\
Populating a Binary Tree \\
The Big Picture \\
11. Traits and Generics \\
Using Traits \\
Trait Objects \\
Trait Object Layout \\
Generic Functions \\
Which to Use \\
Defining and Implementing Traits \\
Default Methods \\
Traits and Other People's Types \\
Self in Traits \\
Subtraits \\
Static Methods \\
Fully Qualified Method Calls \\
Traits That Define Relationships Between Types \\
Associated Types (or How Iterators Work) \\
Generic Traits (or How Operator Overloading Works) \\
Buddy Traits (or How rand::random() Works) \\
Reverse-Engineering Bounds \\
Conclusion \\
12. Operator Overloading \\
Arithmetic and Bitwise Operators \\
Unary Operators \\
Binary Operators \\
Compound Assignment Operators \\
Equality Tests \\
Ordered Comparisons \\
Index and IndexMut \\
Other Operators \\
13. Utility Traits \\
Drop \\
Sized \\
Clone \\
Copy \\
Deref and DerefMut \\
Default \\
AsRef and AsMut \\
Borrow and BorrowMut \\
From and Into \\
ToOwned \\
Borrow and ToOwned at Work: The Humble Cow \\
14. Closures \\
Capturing Variables \\
Closures That Borrow \\
Closures That Steal \\
Function and Closure Types \\
Closure Performance \\
Closures and Safety \\
Closures That Kill \\
FnOnce \\
FnMut \\
Callbacks \\
Using Closures Effectively \\
15. Iterators \\
The Iterator and IntoIterator Traits \\
Creating Iterators \\
iter and iter_mut Methods \\
IntoIterator Implementations \\
drain Methods \\
Other Iterator Sources \\
Iterator Adapters \\
map and filter \\
filter_map and flat_map \\
scan \\
take and take_while \\
skip and skip_while \\
peekable \\
fuse \\
Reversible Iterators and rev \\
inspect \\
chain \\
enumerate \\
zip \\
by_ref \\
cloned \\
cycle \\
Consuming Iterators \\
Simple Accumulation: count, sum, product \\
max, min \\
max_by, min_by \\
max_by_key, min_by_key \\
Comparing Item Sequences \\
any and all \\
position, rposition, and ExactSizeIterator \\
fold \\
nth \\
last \\
find \\
Building Collections: collect and FromIterator \\
The Extend Trait \\
partition \\
Implementing Your Own Iterators \\
16. Collections \\
Overview \\
Vec<T> \\
Accessing Elements \\
Iteration \\
Growing and Shrinking Vectors \\
Joining \\
Splitting \\
Swapping \\
Sorting and Searching \\
Comparing Slices \\
Random Elements \\
Rust Rules Out Invalidation Errors \\
VecDeque<T> \\
LinkedList<T> \\
BinaryHeap<T> \\
HashMap<K, V> and BTreeMap<K, V> \\
Entries \\
Map Iteration \\
HashSet<T> and BTreeSet<T> \\
Set Iteration \\
When Equal Values Are Different \\
Whole-Set Operations \\
Hashing \\
Using a Custom Hashing Algorithm \\
Beyond the Standard Collections \\
17. Strings and Text \\
Some Unicode Background \\
ASCII, Latin-1, and Unicode \\
UTF-8 \\
Text Directionality \\
Characters (char) \\
Classifying Characters \\
Handling Digits \\
Case Conversion for Characters \\
Conversions to and from Integers \\
String and str \\
Creating String Values \\
Simple Inspection \\
Appending and Inserting Text \\
Removing Text \\
Conventions for Searching and Iterating \\
Patterns for Searching Text \\
Searching and Replacing \\
Iterating over Text \\
Trimming \\
Case Conversion for Strings \\
Parsing Other Types from Strings \\
Converting Other Types to Strings \\
Borrowing as Other Text-Like Types \\
Accessing Text as UTF-8 \\
Producing Text from UTF-8 Data \\
Putting Off Allocation \\
Strings as Generic Collections \\
Formatting Values \\
Formatting Text Values \\
Formatting Numbers \\
Formatting Other Types \\
Formatting Values for Debugging \\
Formatting Pointers for Debugging \\
Referring to Arguments by Index or Name \\
Dynamic Widths and Precisions \\
Formatting Your Own Types \\
Using the Formatting Language in Your Own Code \\
Regular Expressions \\
Basic Regex Use \\
Building Regex Values Lazily \\
Normalization \\
Normalization Forms \\
The unicode-normalization Crate \\
18. Input and Output \\
Readers and Writers \\
Readers \\
Buffered Readers \\
Reading Lines \\
Collecting Lines \\
Writers \\
Files \\
Seeking \\
Other Reader and Writer Types \\
Binary Data, Compression, and Serialization \\
Files and Directories \\
OsStr and Path \\
Path and PathBuf Methods \\
Filesystem Access Functions \\
Reading Directories \\
Platform-Specific Features \\
Networking \\
19. Concurrency \\
Fork-Join Parallelism \\
spawn and join \\
Error Handling Across Threads \\
Sharing Immutable Data Across Threads \\
Rayon \\
Revisiting the Mandelbrot Set \\
Channels \\
Sending Values \\
Receiving Values \\
Running the Pipeline \\
Channel Features and Performance \\
Thread Safety: Send and Sync \\
Piping Almost Any Iterator to a Channel \\
Beyond Pipelines \\
Shared Mutable State \\
What Is a Mutex? \\
Mutex<T> \\
mut and Mutex \\
Why Mutexes Are Not Always a Good Idea \\
Deadlock \\
Poisoned Mutexes \\
Multi-Consumer Channels Using Mutexes \\
Read/Write Locks (RwLock<T>) \\
Condition Variables (Condvar) \\
Atomics \\
Global Variables \\
What Hacking Concurrent Code in Rust Is Like \\
20. Macros \\
Macro Basics \\
Basics of Macro Expansion \\
Unintended Consequences \\
Repetition \\
Built-In Macros \\
Debugging Macros \\
The json! Macro \\
Fragment Types \\
Recursion in Macros \\
Using Traits with Macros \\
Scoping and Hygiene \\
Importing and Exporting Macros \\
Avoiding Syntax Errors During Matching \\
Beyond macro_rules! \\
21. Unsafe Code \\
Unsafe from What? \\
Unsafe Blocks \\
Example: An Efficient ASCII String Type \\
Unsafe Functions \\
Unsafe Block or Unsafe Function? \\
Undefined Behavior \\
Unsafe Traits \\
Raw Pointers \\
Dereferencing Raw Pointers Safely \\
Example: RefWithFlag \\
Nullable Pointers \\
Type Sizes and Alignments \\
Pointer Arithmetic \\
Moving into and out of Memory \\
Example: GapBuffer \\
Panic Safety in Unsafe Code \\
Foreign Functions: Calling C and C++ from Rust \\
Finding Common Data Representations \\
Declaring Foreign Functions and Variables \\
Using Functions from Libraries \\
A Raw Interface to libgit2 \\
A Safe Interface to libgit2 \\
Conclusion \\
author = "Zvonimir Bujanovi{\'c} and Lars Karlsson and Daniel Kressner",
title = "A {Householder}-based algorithm for
{Hessenberg}-triangular reduction",
journal = "arxiv.org",
volume = "??",
number = "??",
pages = "??--??",
day = "23",
month = oct,
year = "2017",
bibdate = "Fri Dec 21 10:00:58 2018",
bibsource = "https://www.math.utah.edu/pub/bibnet/authors/h/householder-alston-s.bib;
URL = "https://arxiv.org/abs/1710.08538",
abstract = "The QZ algorithm for computing eigenvalues and
eigenvectors of a matrix pencil $ A - \lambda B $ requires that the
matrices first be reduced to Hessenberg-triangular (HT)
form. The current method of choice for HT reduction
relies entirely on Givens rotations regrouped and
accumulated into small dense matrices which are
subsequently applied using matrix multiplication
routines. A non-vanishing fraction of the total flop
count must nevertheless still be performed as sequences
of overlapping Givens rotations alternately applied
from the left and from the right. The many data
dependencies associated with this computational pattern
lead to inefficient use of the processor and poor
scalability. In this paper, we therefore introduce a
fundamentally different approach that relies entirely
on (large) Householder reflectors partially accumulated
into block reflectors, by using (compact) WY
representations. Even though the new algorithm requires
more floating point operations than the state of the
art algorithm, extensive experiments on both real and
synthetic data indicate that it is still competitive,
even in a sequential setting. The new algorithm is
conjectured to have better parallel scalability, an
idea which is partially supported by early small-scale
experiments using multi-threaded BLAS. The design and
evaluation of a parallel formulation is future work.",
acknowledgement = ack-nhfb,
author = "Man Cao and Minjia Zhang and Aritra Sengupta and
Swarnendu Biswas and Michael D. Bond",
title = "Hybridizing and Relaxing Dependence Tracking for
Efficient Parallel Runtime Support",
journal = j-TOPC,
volume = "4",
number = "2",
pages = "9:1--9:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3108138",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Oct 10 17:42:07 MDT 2017",
bibsource = "http://topc.acm.org/;
abstract = "It is notoriously challenging to develop parallel
software systems that are both scalable and correct.
Runtime support for parallelism---such as multithreaded
record and replay, data race detectors, transactional
memory, and enforcement of stronger memory models---helps
achieve these goals, but existing commodity solutions
slow programs substantially to track (i.e., detect or
control) an execution's cross-thread dependencies
accurately. Prior work tracks cross-thread dependencies
either ``pessimistically,'' slowing every program
access, or ``optimistically,'' allowing for lightweight
instrumentation of most accesses but dramatically
slowing accesses that are conflicting (i.e., involved
in cross-thread dependencies). This article presents
two novel approaches that seek to improve the
performance of dependence tracking. Hybrid tracking
(HT) hybridizes pessimistic and optimistic tracking by
overcoming a fundamental mismatch between these two
kinds of tracking. HT uses an adaptive, profile-based
policy to make runtime decisions about switching
between pessimistic and optimistic tracking. Relaxed
tracking (RT) attempts to reduce optimistic tracking's
overhead on conflicting accesses by tracking
dependencies in a ``relaxed'' way---meaning that not all
dependencies are tracked accurately---while still
preserving both program semantics and runtime support's
correctness. To demonstrate the usefulness and
potential of HT and RT, we build runtime support based
on the two approaches. Our evaluation shows that both
approaches offer performance advantages over existing
approaches, but there exist challenges and
opportunities for further improvement. HT and RT are
distinct solutions to the same problem. It is easier to
build runtime support based on HT than on RT, although
RT does not incur the overhead of online profiling.
This article presents the two approaches together to
inform and inspire future designs for efficient
parallel runtime support.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
author = "Sandra Catal{\'a}n and Francisco D. Igual and Rafael
Mayo and Rafael Rodr{\'\i}guez-S{\'a}nchez and Enrique
S. Quintana-Ort{\'\i}",
title = "Time and energy modeling of a high-performance
multi-threaded {Cholesky} factorization",
volume = "73",
number = "1",
pages = "139--151",
month = jan,
year = "2017",
DOI = "https://doi.org/10.1007/s11227-016-1654-6",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Sat Jun 24 10:31:31 MDT 2017",
bibsource = "http://link.springer.com/journal/11227/73/1;
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Li-Jhan Chen and Hsiang-Yun Cheng and Po-Han Wang and
Chia-Lin Yang",
title = "Improving {GPGPU} Performance via Cache Locality Aware
Thread Block Scheduling",
volume = "16",
number = "2",
pages = "127--131",
month = jul # "\slash " # dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2693371",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Modern GPGPUs support the concurrent execution of
thousands of threads to provide an energy-efficient
platform. However, the massive multi-threading of
GPGPUs incurs serious cache contention, as the cache
lines brought by one thread can easily be evicted by
other threads in the small shared cache. In this paper,
we propose a software-hardware cooperative approach
that exploits the spatial locality among different
thread blocks to better utilize the precious cache
capacity. Through dynamic locality estimation and
thread block scheduling, we can capture more
performance improvement opportunities than prior work
that only explores the spatial locality between
consecutive thread blocks. Evaluations across diverse
GPGPU applications show that, on average, our
locality-aware scheduler provides 25 and 9 percent
performance improvement over the commonly-employed
round-robin scheduler and the state-of-the-art
scheduler, respectively.",
acknowledgement = ack-nhfb,
affiliation = "Chen, LJ (Reprint Author), Natl Taiwan Univ, Taipei
10617, Taiwan. Chen, Li-Jhan; Wang, Po-Han; Yang,
Chia-Lin, Natl Taiwan Univ, Taipei 10617, Taiwan.
Cheng, Hsiang-Yun, Acad Sinica, Taipei 11529, Taiwan.",
author-email = "r03922026@csie.ntu.edu.tw hycheng@citi.sinica.edu.tw
f96922002@csie.ntu.edu.tw yangc@csie.ntu.edu.tw",
da = "2019-06-20",
doc-delivery-number = "FR2AX",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Ministry of Science and Technology of
Taiwan [MOST-105-2221-E-002-156-MY2,
MOST-105-2622-8-002-002, MOST-105-2218-E-002-025];
MediaTek Inc., Hsin-chu, Taiwan",
funding-text = "This work is supported in part by research grants from
the Ministry of Science and Technology of Taiwan
(MOST-105-2221-E-002-156-MY2, MOST-105-2622-8-002-002,
and MOST-105-2218-E-002-025), and sponsored by MediaTek
Inc., Hsin-chu, Taiwan.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache locality; GPGPU; thread block scheduling",
number-of-cited-references = "18",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Chen:2017:IGP",
web-of-science-categories = "Computer Science, Hardware \&
author = "Huanqing Cui and Jian Niu and Chuanai Zhou and Minglei
title = "A Multi-Threading Algorithm to Detect and Remove
Cycles in Vertex- and Arc-Weighted Digraph",
volume = "10",
number = "4",
month = dec,
year = "2017",
DOI = "https://doi.org/10.3390/a10040115",
ISSN = "1999-4893 (electronic)",
ISSN-L = "1999-4893",
bibdate = "Fri May 3 13:50:13 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/algorithms.bib;
URL = "https://www.mdpi.com/1999-4893/10/4/115",
acknowledgement = ack-nhfb,
articleno = "115",
fjournal = "Algorithms (Basel)",
journal-URL = "https://www.mdpi.com/journal/algorithms",
ORCID-numbers = "Huanqing Cui/0000-0002-9251-680X",
pagecount = "??",
pubdates = "Received: 28 August 2017 / Revised: 26 September 2017
/ Accepted: 9 October 2017 / Published: 10 October
author = "Hoang-Vu Dang and Marc Snir and William Gropp",
title = "Eliminating contention bottlenecks in multithreaded
volume = "69",
number = "??",
pages = "1--23",
month = nov,
year = "2017",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Tue Oct 24 15:15:02 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0167819117301187",
acknowledgement = ack-nhfb,
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
author = "Sudakshina Dutta and Dipankar Sarkar and Arvind
title = "Synchronization Validation for Cross-Thread
Dependences in Parallel Programs",
journal = j-INT-J-PARALLEL-PROG,
volume = "45",
number = "6",
pages = "1326--1365",
month = dec,
year = "2017",
DOI = "https://doi.org/10.1007/s10766-016-0467-9",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Sat Nov 18 09:27:28 MST 2017",
bibsource = "http://link.springer.com/journal/10766/45/6;
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
author = "Azadeh Farzan and Victor Nicolet",
title = "Synthesis of divide and conquer parallelism for
journal = j-SIGPLAN,
volume = "52",
number = "6",
pages = "540--555",
month = jun,
year = "2017",
DOI = "https://doi.org/10.1145/3140587.3062355",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Sat Sep 16 10:18:17 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Divide-and-conquer is a common parallel programming
skeleton supported by many cross-platform multithreaded
libraries, and most commonly used by programmers for
parallelization. The challenges of producing (manually
or automatically) a correct divide-and-conquer parallel
program from a given sequential code are two-fold: (1)
assuming that a good solution exists where individual
worker threads execute a code identical to the
sequential one, the programmer has to provide the extra
code for dividing the tasks and combining the partial
results (i.e. joins), and (2) the sequential code may
not be suitable for divide-and-conquer parallelization
as is, and may need to be modified to become a part of
a good solution. We address both challenges in this
paper. We present an automated synthesis technique to
synthesize correct joins and an algorithm for modifying
the sequential code to make it suitable for
parallelization when necessary. This paper focuses on
a class of loops that traverse a read-only collection and
compute a scalar function over that collection. We
present theoretical results for when the necessary
modifications to sequential code are possible,
theoretical guarantees for the algorithmic solutions
presented here, and experimental evaluation of the
approach's success in practice and the quality of the
produced parallel programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '17 conference proceedings.",
author = "J. Feliu and J. Sahuquillo and S. Petit and J. Duato",
title = "{Perf Fair}: A Progress-Aware Scheduler to Enhance
Performance and Fairness in {SMT} Multicores",
journal = j-IEEE-TRANS-COMPUT,
volume = "66",
number = "5",
pages = "905--911",
month = may,
year = "2017",
DOI = "https://doi.org/10.1109/TC.2016.2620977",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Thu Apr 6 07:46:06 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
keywords = "Bandwidth; Estimation; fairness; Interference;
multicore; Multicore processing; performance
estimation; Processor scheduling; Program processors;
Resource management; Scheduling; SMT",
author = "Vaidas Gasiunas and David Dominguez-Sal and Ralph
Acker and Aharon Avitzur and Ilan Bronshtein and Rushan
Chen and Eli Ginot and Norbert Martinez-Bazan and
Michael M{\"u}ller and Alexander Nozdrin and Weijie Ou
and Nir Pachter and Dima Sivov and Eliezer Levy",
title = "Fiber-based architecture for {NFV} cloud databases",
volume = "10",
number = "12",
pages = "1682--1693",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137774",
ISSN = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "The telco industry is gradually shifting from using
monolithic software packages deployed on custom
hardware to using modular virtualized software
functions deployed on cloudified data centers using
commodity hardware. This transformation is referred to
as Network Function Virtualization (NFV). The
scalability of the databases (DBs) underlying the
virtual network functions is the cornerstone for
reaping the benefits from the NFV transformation. This
paper presents an industrial experience of applying
shared-nothing techniques in order to achieve the
scalability of a DB in an NFV setup. The special
combination of requirements in NFV DBs are not easily
met with conventional execution models. Therefore, we
designed a special shared-nothing architecture that is
based on cooperative multi-tasking using user-level
threads (fibers). We further show that the fiber-based
approach outperforms the approach built using
conventional multi-threading and meets the variable
deployment needs of the NFV transformation.
Furthermore, fibers yield a simpler-to-maintain
software and enable controlling a trade-off between
long-duration computations and real-time requests.",
acknowledgement = ack-nhfb,
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
author = "Giorgis Georgakoudis and Hans Vandierendonck and Peter
Thoman and Bronis R. {De Supinski} and Thomas Fahringer
and Dimitrios S. Nikolopoulos",
title = "{SCALO}: Scalability-Aware Parallelism Orchestration
for Multi-Threaded Workloads",
journal = j-TACO,
volume = "14",
number = "4",
pages = "54:1--54:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3158643",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Shared memory machines continue to increase in scale
by adding more parallelism through additional cores and
complex memory hierarchies. Often, executing multiple
applications concurrently, dividing among them hardware
threads, provides greater efficiency rather than
executing a single application with large thread
counts. However, contention for shared resources can
limit the improvement of concurrent application
execution: orchestrating the number of threads used by
each application is essential. In this article, we
contribute SCALO, a solution to orchestrate concurrent
application execution to increase throughput. SCALO
monitors co-executing applications at runtime to
evaluate their scalability. Its optimizing thread
allocator analyzes these scalability estimates to adapt
the parallelism of each program. Unlike previous
approaches, SCALO differs by including dynamic
contention effects on scalability and by controlling
the parallelism during the execution of parallel
regions. Thus, it improves throughput when other
state-of-the-art approaches fail and outperforms them
by up to 40\% when they succeed.",
acknowledgement = ack-nhfb,
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Kyriakos Georgiou and Steve Kerrison and Zbigniew
Chamski and Kerstin Eder",
title = "Energy Transparency for Deeply Embedded Programs",
journal = j-TACO,
volume = "14",
number = "1",
pages = "8:1--8:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3046679",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:58 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Energy transparency is a concept that makes a
program's energy consumption visible, from hardware up
to software, through the different system layers. Such
transparency can enable energy optimizations at each
layer and between layers, as well as help both
programmers and operating systems make energy-aware
decisions. In this article, we focus on deeply embedded
devices, typically used for Internet of Things (IoT)
applications, and demonstrate how to enable energy
transparency through existing static resource analysis
(SRA) techniques and a new target-agnostic profiling
technique, without hardware energy measurements. Our
novel mapping technique enables software energy
consumption estimations at a higher level than the
Instruction Set Architecture (ISA), namely the LLVM
intermediate representation (IR) level, and therefore
introduces energy transparency directly to the LLVM
optimizer. We apply our energy estimation techniques to
a comprehensive set of benchmarks, including single-
and multithreaded embedded programs from two commonly
used concurrency patterns: task farms and pipelines.
Using SRA, our LLVM IR results demonstrate a high
accuracy with a deviation in the range of 1\% from the
ISA SRA. Our profiling technique captures the actual
energy consumption at the LLVM IR level with an average
error of 3\%.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Ujjwal Gupta and Chetan Arvind Patil and Ganapati Bhat
and Prabhat Mishra and Umit Y. Ogras",
title = "{DyPO}: Dynamic {Pareto}-Optimal Configuration
Selection for Heterogeneous {MpSoCs}",
journal = j-TECS,
volume = "16",
number = "5s",
pages = "123:1--123:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3126530",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Oct 17 18:16:33 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Modern multiprocessor systems-on-chip (MpSoCs) offer
tremendous power and performance optimization
opportunities by tuning thousands of potential voltage,
frequency and core configurations. As the workload
phases change at runtime, different configurations may
become optimal with respect to power, performance or
other metrics. Identifying the optimal configuration at
runtime is infeasible due to the large number of
workloads and configurations. This paper proposes a
novel methodology that can find the Pareto-optimal
configurations at runtime as a function of the
workload. To achieve this, we perform an extensive
offline characterization to find classifiers that map
performance counters to optimal configurations. Then,
we use these classifiers and performance counters at
runtime to choose Pareto-optimal configurations. We
evaluate the proposed methodology by maximizing the
performance per watt for 18 single- and multi-threaded
applications. Our experiments demonstrate an average
increase of 93\%, 81\% and 6\% in performance per watt
compared to the interactive, on demand and powersave
governors, respectively.",
acknowledgement = ack-nhfb,
articleno = "123",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840",
author = "Can Hankendi and Ayse Kivilcim Coskun",
title = "Scale \& Cap: Scaling-Aware Resource Management for
Consolidated Multi-threaded Applications",
journal = j-TODAES,
volume = "22",
number = "2",
pages = "30:1--30:??",
month = mar,
year = "2017",
DOI = "https://doi.org/10.1145/2994145",
ISSN = "1084-4309 (print), 1557-7309 (electronic)",
ISSN-L = "1084-4309",
bibdate = "Fri Jul 21 10:49:30 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/todaes/;
abstract = "As the number of cores per server node increases,
designing multi-threaded applications has become
essential to efficiently utilize the available hardware
parallelism. Many application domains have started to
adopt multi-threaded programming; thus, efficient
management of multi-threaded applications has become a
significant research problem. Efficient execution of
multi-threaded workloads on cloud environments, where
applications are often consolidated by means of
virtualization, relies on understanding the
multi-threaded specific characteristics of the
applications. Furthermore, energy cost and power
delivery limitations require data center server nodes
to work under power caps, which bring additional
challenges to runtime management of consolidated
multi-threaded applications. This article proposes a
dynamic resource allocation technique for consolidated
multi-threaded applications for power-constrained
environments. Our technique takes into account
application characteristics specific to multi-threaded
applications, such as power and performance scaling, to
make resource distribution decisions at runtime to
improve the overall performance, while accurately
tracking dynamic power caps. We implement and evaluate
our technique on state-of-the-art servers and show that
the proposed technique improves the application
performance by up to 21\% under power caps compared to
a default resource manager.",
acknowledgement = ack-nhfb,
articleno = "30",
fjournal = "ACM Transactions on Design Automation of Electronic
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776",
author = "Ayman Hroub and M. E. S. Elrabaa and M. F. Mudawar and
A. Khayyat",
title = "Efficient Generation of Compact Execution Traces for
Multicore Architectural Simulations",
journal = j-TACO,
volume = "14",
number = "3",
pages = "27:1--27:??",
month = sep,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3106342",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Sep 6 17:12:05 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Requiring no functional simulation, trace-driven
simulation has the potential of achieving faster
simulation speeds than execution-driven simulation of
multicore architectures. An efficient, on-the-fly,
high-fidelity trace generation method for multithreaded
applications is reported. The generated trace is
encoded in an instruction-like binary format that can
be directly ``interpreted'' by a timing simulator to
simulate a general load/store or x86-like architecture.
A complete tool suite that has been developed and used
for evaluation of the proposed method showed that it
produces smaller traces over existing trace compression
methods while retaining good fidelity including all
threading- and synchronization-related events.",
acknowledgement = ack-nhfb,
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Sungbo Jung and Dar-Jen Chang and Juw Won Park",
title = "Large scale document inversion using a multi-threaded
computing system",
journal = j-SIGAPP,
volume = "17",
number = "2",
pages = "27--35",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3131080.3131083",
ISSN = "1559-6915 (print), 1931-0161 (electronic)",
ISSN-L = "1559-6915",
bibdate = "Thu Jan 23 10:25:03 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/abs/10.1145/3131080.3131083",
abstract = "Current microprocessor architecture is moving towards
multi-core/multi-threaded systems. This trend has led
to a surge of interest in using multi-threaded
computing devices, such as the Graphics Processing Unit
(GPU), for general purpose computing. We \ldots{}",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGAPP Applied Computing Review",
journal-URL = "https://dl.acm.org/loi/sigapp",
author = "Steve Klabnik and Carol Nichols",
title = "The {Rust} Programming Language",
publisher = pub-NO-STARCH,
address = pub-NO-STARCH:adr,
pages = "xxvii + 519",
year = "2017",
ISBN = "1-59327-828-4 (paperback), 1-59327-851-9 (e-pub)",
ISBN-13 = "978-1-59327-828-1 (paperback), 978-1-59327-851-9
LCCN = "QA76.73.R87 K53 2018",
bibdate = "Thu Oct 31 18:42:15 MDT 2019",
bibsource = "fsz3950.oclc.org:210/WorldCat;
abstract = "\booktitle{The Rust Programming Language} is the
official book on Rust; a community-developed, systems
programming language that runs blazingly fast, prevents
segfaults, and guarantees thread safety. Rust's memory
safety guarantees, enforced at compile time, safeguard
your programs against the many problems that pervade
other systems languages. Rust offers the control and
performance of a low-level language with the helpful
abstractions of a high level one, and does this all
without having a garbage collector. These
characteristics make Rust useful for embedding in other
languages, programs with specific space and time
requirements, and writing low-level code, like device
drivers and operating systems. \booktitle{The Rust
Programming Language} begins with a quick hands-on
project to introduce the basics, then explores key
concepts in depth, such as ownership, the type system,
error handling, and fearless concurrency. Detailed
explanations of Rust-oriented takes on topics like
pattern matching, iterators, and smart pointers combine
with examples and exercises to take you from theory to
practice. In addition to its thorough coverage of more
granular topics, \booktitle{The Rust Programming
Language} will show you how to: * Grasp important
concepts unique to Rust like ownership, borrowing, and
lifetimes; * Use Cargo, Rust's built-in package
manager, to build your code, including downloading and
building dependencies; * Effectively use Rust's
zero-cost abstractions and learn to build your own.
Developed with help from the community, \booktitle{The
Rust Programming Language} is your official guide to
becoming a productive Rust programmer. The official
guide to Rust, a community-developed, systems
programming language. Begins with a hands-on project to
introduce the basics, then explores key concepts in
acknowledgement = ack-nhfb,
libnote = "Not in my library.",
subject = "Computer programming; Programming languages
(Electronic computers); Computer programming.;
Programming languages (Electronic computers)",
tableofcontents = "Foreword / by Nicholas Matsakis and Aaron Turon \\
Introduction \\
1: Getting Started \\
2: A Quick Tutorial \\
Guessing Game \\
3: Common Programming Concepts \\
4: Understanding Ownership \\
5: Structs \\
6: Enums and Pattern Matching \\
7: Modules \\
8: Common Collections \\
9: Error Handling \\
10: Generic Types, Traits, and Lifetimes \\
11: Testing \\
12: An Input\slash Output Project \\
13: Functional Language Features in Rust \\
Iterators and Closures \\
14: More about Cargo and Crates.io \\
15: Smart Pointers \\
16: Concurrency \\
17: Is Rust Object Oriented? \\
18: Patterns \\
19: More About Lifetimes \\
20: Advanced Type System Features \\
Appendix A: Keywords \\
Appendix B: Operators \\
Appendix C: Derivable Traits \\
Appendix D: Nightly Rust\ \\
Nightly Rust \\
author = "Amit Kleinmann and Avishai Wool",
title = "Automatic Construction of Statechart-Based Anomaly
Detection Models for Multi-Threaded Industrial Control
journal = j-TIST,
volume = "8",
number = "4",
pages = "55:1--55:??",
month = jul,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3011018",
ISSN = "2157-6904 (print), 2157-6912 (electronic)",
ISSN-L = "2157-6904",
bibdate = "Sat Dec 23 10:12:41 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Traffic of Industrial Control System (ICS) between the
Human Machine Interface (HMI) and the Programmable
Logic Controller (PLC) is known to be highly periodic.
However, it is sometimes multiplexed, due to
asynchronous scheduling. Modeling the network traffic
patterns of multiplexed ICS streams using Deterministic
Finite Automata (DFA) for anomaly detection typically
produces a very large DFA and a high false-alarm rate.
In this article, we introduce a new modeling approach
that addresses this gap. Our Statechart DFA modeling
includes multiple DFAs, one per cyclic pattern,
together with a DFA-selector that de-multiplexes the
incoming traffic into sub-channels and sends them to
their respective DFAs. We demonstrate how to
automatically construct the statechart from a captured
traffic stream. Our unsupervised learning algorithms
first build a Discrete-Time Markov Chain (DTMC) from
the stream. Next, we split the symbols into sets, one
per multiplexed cycle, based on symbol frequencies and
node degrees in the DTMC graph. Then, we create a
sub-graph for each cycle and extract Euler cycles for
each sub-graph. The final statechart is comprised of
one DFA per Euler cycle. The algorithms allow for
non-unique symbols, which appear in more than one
cycle, and also for symbols that appear more than once
in a cycle. We evaluated our solution on traces from a
production ICS using the Siemens S7-0x72 protocol. We
also stress-tested our algorithms on a collection of
synthetically-generated traces that simulated
multiplexed ICS traces with varying levels of symbol
uniqueness and time overlap. The algorithms were able
to split the symbols into sets with 99.6\% accuracy.
The resulting statechart modeled the traces with a
median false-alarm rate of as low as 0.483\%. In all
but the most extreme scenarios, the Statechart model
drastically reduced both the false-alarm rate and the
learned model size in comparison with the naive
single-DFA model.",
acknowledgement = ack-nhfb,
articleno = "55",
fjournal = "ACM Transactions on Intelligent Systems and Technology
journal-URL = "http://portal.acm.org/citation.cfm?id=J1318",
author = "Kensuke Kojima and Atsushi Igarashi",
title = "A {Hoare} Logic for {GPU} Kernels",
journal = j-TOCL,
volume = "18",
number = "1",
pages = "3:1--3:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3001834",
ISSN = "1529-3785 (print), 1557-945X (electronic)",
ISSN-L = "1529-3785",
bibdate = "Thu Apr 13 17:53:54 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tocl/;
abstract = "We study a Hoare Logic to reason about parallel
programs executed on graphics processing units (GPUs),
called GPU kernels. During the execution of GPU
kernels, multiple threads execute in lockstep, that is,
execute the same instruction simultaneously. When the
control branches, the two branches are executed
sequentially, but during the execution of each branch
only those threads that take it are enabled; after the
control converges, all the threads are enabled and
again execute in lockstep. In this article, we first
consider a semantics in which all threads execute in
lockstep (this semantics simplifies the actual
execution model of GPUs) and adapt Hoare Logic to this
setting by augmenting the usual Hoare triples with an
additional component representing the set of enabled
threads. It is determined that the soundness and
relative completeness of the logic do not hold for all
programs; a difficulty arises from the fact that one
thread can invalidate the loop termination condition of
another thread through shared memory. We overcome this
difficulty by identifying an appropriate class of
programs for which the soundness and relative
completeness hold. Additionally, we discuss thread
interleaving, which is present in the actual execution
of GPUs but not in the lockstep semantics mentioned
above. We show that if a program is race free, then the
lockstep and interleaving semantics produce the same
result. This implies that our logic is sound and
relatively complete for race-free programs, even if the
thread interleaving is taken into account.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Computational Logic",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J773",
author = "Maciej Komosinski and Szymon Ulatowski",
title = "Multithreaded computing in evolutionary design and in
artificial life simulations",
volume = "73",
number = "5",
pages = "2214--2228",
month = may,
year = "2017",
DOI = "https://doi.org/10.1007/s11227-016-1923-4",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Sat Jun 24 10:31:33 MDT 2017",
bibsource = "http://link.springer.com/journal/11227/73/5",
URL = "http://link.springer.com/content/pdf/10.1007/s11227-016-1923-4.pdf",
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Eryk Kopczy{\'n}ski and Szymon Toru{\'n}czyk",
title = "{LOIS}: syntax and semantics",
journal = j-SIGPLAN,
volume = "52",
number = "1",
pages = "586--598",
month = jan,
year = "2017",
DOI = "https://doi.org/10.1145/3093333.3009876",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Sat Sep 16 10:18:14 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We present the semantics of an imperative programming
language called LOIS (Looping Over Infinite Sets),
which allows iterating through certain infinite sets,
in finite time. Our semantics intuitively correspond to
execution of infinitely many threads in parallel. This
allows to merge the power of abstract mathematical
constructions into imperative programming. Infinite
sets are internally represented using first order
formulas over some underlying logical structure, and
SMT solvers are employed to evaluate programs.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "POPL '17 conference proceedings.",
author = "Doowon Lee and Valeria Bertacco",
title = "{MTraceCheck}: Validating Non-Deterministic Behavior
of Memory Consistency Models in Post-Silicon Validation",
journal = j-COMP-ARCH-NEWS,
volume = "45",
number = "2",
pages = "201--213",
month = may,
year = "2017",
DOI = "https://doi.org/10.1145/3140659.3080235",
ISSN = "0163-5964 (print), 1943-5851 (electronic)",
ISSN-L = "0163-5964",
bibdate = "Fri Sep 15 11:09:14 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This work presents a minimally-intrusive,
high-performance, post-silicon validation framework for
validating memory consistency in multi-core systems.
Our framework generates constrained-random tests that
are instrumented with observability-enhancing code for
memory consistency verification. For each test, we
generate a set of compact signatures reflecting the
memory-ordering patterns observed over many executions
of the test, with each of the signatures corresponding
to a unique memory-ordering pattern. We then leverage
an efficient and novel analysis to quickly determine if
the observed execution patterns represented by each
unique signature abide by the memory consistency model.
Our analysis derives its efficiency by exploiting the
structural similarities among the patterns observed. We
evaluated our framework, MTraceCheck, on two platforms:
an x86-based desktop and an ARM-based SoC platform,
both running multi-threaded test programs in a
bare-metal environment. We show that MTraceCheck
reduces the perturbation introduced by the
memory-ordering monitoring activity by 93\% on average,
compared to a baseline register flushing approach that
saves the register's state after each load operation.
We also reduce the computation requirements of our
consistency checking analysis by 81\% on average,
compared to a conventional topological sorting
solution. We finally demonstrate the effectiveness of
MTraceCheck on buggy designs, by evaluating multiple
case studies where it successfully exposes subtle bugs
in a full-system simulation environment.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGARCH Computer Architecture News",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89",
author = "Cha V. Li and Vinicius Petrucci and Daniel Moss{\'e}",
title = "Exploring Machine Learning for Thread Characterization
on Heterogeneous Multiprocessors",
journal = j-OPER-SYS-REV,
volume = "51",
number = "1",
pages = "113--123",
month = aug,
year = "2017",
DOI = "https://doi.org/10.1145/3139645.3139664",
ISSN = "0163-5980 (print), 1943-586X (electronic)",
ISSN-L = "0163-5980",
bibdate = "Fri Sep 15 10:37:05 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "We introduce a thread characterization method that
explores hardware performance counters and machine
learning techniques to automate estimating workload
execution on heterogeneous processors. We show that our
characterization scheme achieves higher accuracy when
predicting performance indicators, such as instructions
per cycle and last-level cache misses, commonly used to
determine the mapping of threads to processor types at
runtime. We also show that support vector regression
achieves higher accuracy when compared to linear
regression, and has very low (1\%) overhead. The
results presented in this paper can provide a
foundation for advanced investigations and interesting
new directions in intelligent thread scheduling and
power management on multiprocessors.",
acknowledgement = ack-nhfb,
fjournal = "Operating Systems Review",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J597",
author = "Yuxiang Li and Yinliang Zhao and Qiangsheng Wu",
title = "{GbA}: a graph-based thread partition approach in
speculative multithreading",
journal = j-CCPE,
volume = "29",
number = "21",
pages = "??--??",
day = "10",
month = nov,
year = "2017",
DOI = "https://doi.org/10.1002/cpe.4294",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Sat Dec 30 09:11:58 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib",
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Practice and Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
author = "Zhongwei Lin and Carl Tropper and Robert A. McDougal
and Mohammand Nazrul Ishlam Patoary and William W.
Lytton and Yiping Yao and Michael L. Hines",
title = "Multithreaded Stochastic {PDES} for Reactions and
Diffusions in Neurons",
journal = j-TOMACS,
volume = "27",
number = "2",
pages = "7:1--7:??",
month = jul,
year = "2017",
DOI = "https://doi.org/10.1145/2987373",
ISSN = "1049-3301 (print), 1558-1195 (electronic)",
ISSN-L = "1049-3301",
bibdate = "Tue Jul 11 15:41:32 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomacs/",
abstract = "Cells exhibit stochastic behavior when the number of
molecules is small. Hence a stochastic
reaction-diffusion simulator capable of working at
scale can provide a more accurate view of molecular
dynamics within the cell. This article describes a
parallel discrete event simulator, Neuron Time
Warp-Multi Thread (NTW-MT), developed for the
simulation of reaction diffusion models of neurons. To
the best of our knowledge, this is the first parallel
discrete event simulator oriented toward stochastic
simulation of chemical reactions in a neuron. The
simulator was developed as part of the NEURON project.
NTW-MT is optimistic and thread based, which attempts
to capitalize on multicore architectures used in high
performance machines. It makes use of a multilevel
queue for the pending event set and a single rollback
message in place of individual antimessages to disperse
contention and decrease the overhead of processing
rollbacks. Global Virtual Time is computed
asynchronously both within and among processes to get
rid of the overhead for synchronizing threads. Memory
usage is managed in order to avoid locking and
unlocking when allocating and deallocating memory and
to maximize cache locality. We verified our simulator
on a calcium buffer model. We examined its performance
on a calcium wave model, comparing it to the
performance of a process based optimistic simulator and
a threaded simulator which uses a single priority queue
for each thread. Our multithreaded simulator is shown
to achieve superior performance to these simulators.
Finally, we demonstrated the scalability of our
simulator on a larger Calcium-Induced Calcium Release
(CICR) model and a more detailed CICR model.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Modeling and Computer Simulation",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J781",
author = "Hao Luo and Pengcheng Li and Chen Ding",
title = "Thread Data Sharing in Cache: Theory and Measurement",
journal = j-SIGPLAN,
volume = "52",
number = "8",
pages = "103--115",
month = aug,
year = "2017",
DOI = "https://doi.org/10.1145/3155284.3018759",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Fri Dec 1 18:56:12 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "On modern multi-core processors, independent workloads
often interfere with each other by competing for shared
cache space. However, for multi-threaded workloads,
where a single copy of data can be accessed by multiple
threads, the threads can cooperatively share cache.
Because data sharing consolidates the collective
working set of threads, the effective size of shared
cache becomes larger than it would have been when data
are not shared. This paper presents a new theory of
data sharing. It includes (1) a new metric called the
shared footprint to mathematically compute the amount
of data shared by any group of threads in any size
cache, and (2) a linear-time algorithm to measure
shared footprint by scanning the memory trace of a
multi-threaded program. The paper presents the
practical implementation and evaluates the new theory
using 14 PARSEC and SPEC OMP benchmarks, including an
example use of shared footprint in program optimization.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '17 conference proceedings.",
author = "David Gonzalez Marquez and Adrian Cristal Kestelman
and Esteban Mocskos",
title = "{Mth}: Codesigned Hardware\slash Software Support for
Fine Grain Threads",
volume = "16",
number = "1",
pages = "64--67",
month = jan # "\slash " # jun,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2016.2606383",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Multi-core processors are ubiquitous in all market
segments from embedded to high performance computing,
but only few applications can efficiently utilize them.
Existing parallel frameworks aim to support
thread-level parallelism in applications, but the
imposed overhead prevents their usage for small problem
instances. This work presents Micro-threads (Mth) a
hardware-software proposal focused on a shared thread
management model enabling the use of parallel resources
in applications that have small chunks of parallel code
or small problem inputs by a combination of software
and hardware: delegation of the resource control to the
application, an improved mechanism to store and fill
processor's context, and an efficient synchronization
system. Four sample applications are used to test our
proposal: HSL filter (trivially parallel), FFT Radix2
(recursive algorithm), LU decomposition (barrier every
cycle) and Dantzig algorithm (graph based, matrix
manipulation). The results encourage the use of Mth and
could smooth the use of multiple cores for applications
that currently can not take advantage of the
proliferation of the available parallel resources in
each chip.",
acknowledgement = ack-nhfb,
affiliation = "Marquez, DG (Reprint Author), Univ Buenos Aires, Fac
Ciencias Exactas \& Nat, Dept Comp Sci, C1428EGA,
RA-1053 Buenos Aires, DF, Argentina. Marquez, David
Gonzalez; Mocskos, Esteban, Univ Buenos Aires, Fac
Ciencias Exactas \& Nat, Dept Comp Sci, C1428EGA,
RA-1053 Buenos Aires, DF, Argentina. Mocskos, Esteban,
CSC CONICET, C1425FQD, RA-2390 Buenos Aires, DF,
Argentina. Kestelman, Adrian Cristal, CSIC, IIIA,
Barcelona Supercomp Ctr, ES-08034 Barcelona, Spain.
Kestelman, Adrian Cristal, Univ Politecn Cataluna, Dept
Comp Architecture, ES-08034 Barcelona, Spain.",
author-email = "dmarquez@dc.uba.ar adrian.cristal@bsc.es",
da = "2019-06-20",
doc-delivery-number = "EY5PB",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Universidad de Buenos Aires [UBACyT
20020130200096BA]; CONICET [PIP 11220110100379]",
funding-text = "This work was partially funded by grants from
Universidad de Buenos Aires (UBACyT 20020130200096BA)
and CONICET (PIP 11220110100379). The authors thank
specially Osman Unsal for reading this article with
fruitful criticism.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "multicore processing; multithreading; Parallel
architectures; parallel programming",
keywords-plus = "PARALLELISM",
number-of-cited-references = "11",
ORCID-numbers = "Mocskos, Esteban/0000-0002-6473-7672",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Marquez:2017:MCH",
web-of-science-categories = "Computer Science, Hardware \& Architecture",
author = "George Matheou and Paraskevas Evripidou",
title = "Data-Driven Concurrency for High Performance Computing",
journal = j-TACO,
volume = "14",
number = "4",
pages = "53:1--53:??",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3162014",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Dec 22 18:25:55 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "In this work, we utilize dynamic dataflow/data-driven
techniques to improve the performance of high
performance computing (HPC) systems. The proposed
techniques are implemented and evaluated through an
efficient, portable, and robust programming framework
that enables data-driven concurrency on HPC systems.
The proposed framework is based on data-driven
multithreading (DDM), a hybrid control-flow/dataflow
model that schedules threads based on data availability
on sequential processors. The proposed framework was
evaluated using several benchmarks, with different
characteristics, on two different systems: a 4-node AMD
system with a total of 128 cores and a 64-node Intel
HPC system with a total of 768 cores. The performance
evaluation shows that the proposed framework scales
well and tolerates scheduling overheads and memory
latencies effectively. We also compare our framework to
MPI, DDM-VM, and OmpSs@Cluster. The comparison results
show that the proposed framework obtains comparable or
better performance.",
acknowledgement = ack-nhfb,
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Remigius Meier and Armin Rigo and Thomas R. Gross",
title = "Parallel virtual machines with {RPython}",
journal = j-SIGPLAN,
volume = "52",
number = "2",
pages = "48--59",
month = feb,
year = "2017",
DOI = "https://doi.org/10.1145/3093334.2989233",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Sat Sep 16 10:18:15 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The RPython framework takes an interpreter for a
dynamic language as its input and produces a Virtual
Machine (VM) for that language. RPython is being
used to develop PyPy, a high-performance Python
interpreter. However, the produced VM does not support
parallel execution since the framework relies on a
Global Interpreter Lock (GIL): PyPy serialises the
execution of multi-threaded Python programs. We
describe the rationale and design of a new parallel
execution model for RPython that allows the generation
of parallel virtual machines while leaving the language
semantics unchanged. This model then allows different
implementations of concurrency control, and we discuss
an implementation based on a GIL and an implementation
based on Software Transactional Memory (STM). To
evaluate the benefits of either choice, we adapt PyPy
to work with both implementations (GIL and STM). The
evaluation shows that PyPy with STM improves the
runtime of a set of multi-threaded Python programs over
PyPy with a GIL by factors in the range of 1.87 $
\times $ up to 5.96 $ \times $ when executing on a
processor with 8 cores.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "DLS '16 conference proceedings.",
author = "Hosein Nazarpour and Yli{\`e}s Falcone and Saddek
Bensalem and Marius Bozga",
title = "Concurrency-preserving and sound monitoring of
multi-threaded component-based systems: theory,
algorithms, implementation, and evaluation",
journal = j-FORM-ASP-COMPUT,
volume = "29",
number = "6",
pages = "951--986",
month = nov,
year = "2017",
DOI = "https://doi.org/10.1007/s00165-017-0422-6",
ISSN = "0934-5043 (print), 1433-299X (electronic)",
ISSN-L = "0934-5043",
bibdate = "Thu Nov 23 07:37:44 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/formaspcomput.bib",
URL = "http://link.springer.com/article/10.1007/s00165-017-0422-6",
acknowledgement = ack-nhfb,
fjournal = "Formal Aspects of Computing",
journal-URL = "http://link.springer.com/journal/165",
author = "James Nutaro and Bernard Zeigler",
title = "How to apply {Amdahl}'s law to multithreaded multicore processors",
journal = j-J-PAR-DIST-COMP,
volume = "107",
number = "??",
pages = "1--2",
month = sep,
year = "2017",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Sat Aug 19 13:10:31 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib",
URL = "http://www.sciencedirect.com/science/article/pii/S0743731517300941",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Jaehyun Park and Seungcheol Baek and Hyung Gyu Lee and
Chrysostomos Nicopoulos and Vinson Young and Junghee
Lee and Jongman Kim",
title = "{HoPE}: Hot-Cacheline Prediction for Dynamic Early
Decompression in Compressed {LLCs}",
journal = j-TODAES,
volume = "22",
number = "3",
pages = "40:1--40:??",
month = may,
year = "2017",
DOI = "https://doi.org/10.1145/2999538",
ISSN = "1084-4309 (print), 1557-7309 (electronic)",
ISSN-L = "1084-4309",
bibdate = "Fri Jul 21 10:49:30 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/todaes/",
abstract = "Data compression plays a pivotal role in improving
system performance and reducing energy consumption,
because it increases the logical effective capacity of
a compressed memory system without physically
increasing the memory size. However, data compression
techniques incur some cost, such as non-negligible
compression and decompression overhead. This overhead
becomes more severe if compression is used in the
cache. In this article, we aim to minimize the read-hit
decompression penalty in compressed Last-Level Caches
(LLCs) by speculatively decompressing frequently used
cachelines. To this end, we propose a Hot-cacheline
Prediction and Early decompression (HoPE) mechanism
that consists of three synergistic techniques:
Hot-cacheline Prediction (HP), Early Decompression
(ED), and Hit-history-based Insertion (HBI). HP and HBI
efficiently identify the hot compressed cachelines,
while ED selectively decompresses hot cachelines, based
on their size information. Unlike previous approaches,
the HoPE framework considers the performance
balance/tradeoff between the increased effective cache
capacity and the decompression penalty. To evaluate the
effectiveness of the proposed HoPE mechanism, we run
extensive simulations on memory traces obtained from
multi-threaded benchmarks running on a full-system
simulation framework. We observe significant
performance improvements over compressed cache schemes
employing the conventional Least-Recently Used (LRU)
replacement policy, the Dynamic Re-Reference Interval
Prediction (DRRIP) scheme, and the Effective Capacity
Maximizer (ECM) compressed cache management mechanism.
Specifically, HoPE exhibits system performance
improvements of approximately 11\%, on average, over
LRU, 8\% over DRRIP, and 7\% over ECM by reducing the
read-hit decompression penalty by around 65\%, over a
wide range of applications.",
acknowledgement = ack-nhfb,
articleno = "40",
fjournal = "ACM Transactions on Design Automation of Electronic Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776",
author = "Anuj Pathania and Vanchinathan Venkataramani and
Muhammad Shafique and Tulika Mitra and J{\"o}rg Henkel",
title = "Defragmentation of Tasks in Many-Core Architecture",
journal = j-TACO,
volume = "14",
number = "1",
pages = "2:1--2:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3050437",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jul 24 18:00:58 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Many-cores can execute multiple multithreaded tasks in
parallel. A task performs most efficiently when it is
executed over a spatially connected and compact subset
of cores so that performance loss due to communication
overhead imposed by the task's threads spread across
the allocated cores is minimal. Over a span of time,
unallocated cores can get scattered all over the
many-core, creating fragments in the task mapping.
These fragments can prevent efficient contiguous
mapping of incoming new tasks leading to loss of
performance. This problem can be alleviated by using a
task defragmenter, which consolidates smaller fragments
into larger fragments wherein the incoming tasks can be
efficiently executed. Optimal defragmentation of a
many-core is an NP-hard problem in the general case.
Therefore, we simplify the original problem to a
problem that can be solved optimally in polynomial
time. In this work, we introduce a concept of
exponentially separable mapping (ESM), which defines a
set of task mapping constraints on a many-core. We
prove that an ESM enforcing many-core can be
defragmented optimally in polynomial time.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Phillipe Pereira and Higo Albuquerque and Isabela da
Silva and Hendrio Marques and Felipe Monteiro and
Ricardo Ferreira and Lucas Cordeiro",
title = "{SMT}-based context-bounded model checking for {CUDA} programs",
journal = j-CCPE,
volume = "29",
number = "22",
pages = "??--??",
day = "25",
month = nov,
year = "2017",
DOI = "https://doi.org/10.1002/cpe.3934",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Sat Dec 30 09:11:59 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib",
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Practice and Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
author = "Milan B. Radulovi{\'c} and Sylvain Girbal and Milo V. Toma{\v{s}}evi{\'c}",
title = "Low-level implementation of the {SISC} protocol for
thread-level speculation on a multi-core architecture",
volume = "67",
number = "??",
pages = "1--19",
month = sep,
year = "2017",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Wed Aug 9 14:49:25 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.sciencedirect.com/science/article/pii/S0167819117300972",
acknowledgement = ack-nhfb,
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
author = "Oliver Reiche and Christof Kobylko and Frank Hannig
and J{\"u}rgen Teich",
title = "Auto-vectorization for image processing {DSLs}",
journal = j-SIGPLAN,
volume = "52",
number = "4",
pages = "21--30",
month = may,
year = "2017",
DOI = "https://doi.org/10.1145/3140582.3081039",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Sat Sep 16 10:18:15 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The parallelization of programs and distributing their
workloads to multiple threads can be a challenging
task. In addition to multi-threading, harnessing vector
units in CPUs proves highly desirable. However,
employing vector units to speed up programs can be
quite tedious. Either a program developer solely relies
on the auto-vectorization capabilities of the compiler
or he manually applies vector intrinsics, which is
extremely error-prone, difficult to maintain, and not
portable at all. Based on whole-function vectorization,
a method to replace control flow with data flow, we
propose auto-vectorization techniques for image
processing DSLs in the context of source-to-source
compilation. The approach does not require the input to
be available in SSA form. Moreover, we formulate
constraints under which the vectorization analysis and
code transformations may be greatly simplified in the
context of image processing DSLs. As part of our
methodology, we present control flow to data flow
transformation as a source-to-source translation.
Moreover, we propose a method to efficiently analyze
algorithms with mixed bit-width data types to determine
the optimal SIMD width, independently of the target
instruction set. The techniques are integrated into an
open source DSL framework. Subsequently, the
vectorization capabilities are compared to a variety of
existing state-of-the-art C/C++ compilers. A geometric
mean speedup of up to 3.14 is observed for benchmarks
taken from ISPC and image processing, compared to
non-vectorized executions.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "LCTES '17 conference proceedings.",
author = "Olli Saarikivi and Hern{\'a}n Ponce-De-Le{\'o}n and
Kari K{\"a}hk{\"o}nen and Keijo Heljanko and Javier Esparza",
title = "Minimizing Test Suites with Unfoldings of
Multithreaded Programs",
journal = j-TECS,
volume = "16",
number = "2",
pages = "45:1--45:??",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3012281",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Mon Jul 24 09:51:12 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This article focuses on computing minimal test suites
for multithreaded programs. Based on previous work on
test case generation for multithreaded programs using
unfoldings, this article shows how this unfolding can
be used to generate minimal test suites covering all
local states of the program. Generating such minimal
test suites is shown to be NP-complete in the size of
the unfolding. We propose an SMT encoding for this
problem and two methods based on heuristics which only
approximate the solution, but scale better in practice.
Finally, we apply our methods to compute the minimal
test suites for several benchmarks.",
acknowledgement = ack-nhfb,
articleno = "45",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840",
author = "Conrad Sanderson and Ryan Curtin",
title = "\pkg{gmm\_diag} and \pkg{gmm\_full}: {C++} classes for
multi-threaded {Gaussian} mixture models and {Expectation-Maximisation}",
journal = j-J-OPEN-SOURCE-SOFT,
volume = "2",
number = "18",
pages = "365:1--365:2",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.21105/joss.00365",
ISSN = "2475-9066",
ISSN-L = "2475-9066",
bibdate = "Thu Sep 13 08:09:35 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/joss.bib",
URL = "http://joss.theoj.org/papers/10.21105/joss.00365",
acknowledgement = ack-nhfb,
fjournal = "Journal of Open Source Software",
journal-URL = "http://joss.theoj.org/",
onlinedate = "16 October 2017",
ORCID-numbers = "Conrad Sanderson / 0000-0002-0049-4501; Ryan Curtin / ????",
author = "Benjamin Carrion Schafer",
title = "Parallel High-Level Synthesis Design Space Exploration
for Behavioral {IPs} of Exact Latencies",
journal = j-TODAES,
volume = "22",
number = "4",
pages = "65:1--65:??",
month = jul,
year = "2017",
DOI = "https://doi.org/10.1145/3041219",
ISSN = "1084-4309 (print), 1557-7309 (electronic)",
ISSN-L = "1084-4309",
bibdate = "Mon Jan 22 09:03:32 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This work presents a Design Space Exploration (DSE)
method for Behavioral IPs (BIPs) given in ANSI-C or
SystemC to find the smallest micro-architecture for a
specific target latency. Previous work on High-Level
Synthesis (HLS) DSE mainly focused on finding a
tradeoff curve with Pareto-optimal designs. HLS is,
however, a single process (component) synthesis method.
Very often, the latency of the components requires a
specific fixed latency when inserted within a larger
system. This work presents a fast multi-threaded method
to find the smallest micro-architecture for a given BIP
and target latency by discriminating between all
different exploration knobs and exploring these
concurrently. Experimental results show that our
proposed method is very effective and comprehensive
results compare the quality of results vs. the speedup
of our proposed explorer.",
acknowledgement = ack-nhfb,
articleno = "65",
fjournal = "ACM Transactions on Design Automation of Electronic Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776",
author = "Z. Tian and T. Liu and Q. Zheng and E. Zhuang and M.
Fan and Z. Yang",
title = "Reviving Sequential Program Birthmarking for
Multithreaded Software Plagiarism Detection",
volume = "PP",
number = "99",
pages = "1--1",
month = "????",
year = "2017",
DOI = "https://doi.org/10.1109/TSE.2017.2688383",
ISSN = "0098-5589 (print), 1939-3520 (electronic)",
ISSN-L = "0098-5589",
bibdate = "Thu Feb 1 19:49:24 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib",
URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7888597",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Software Engineering",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
author = "Yatish Turakhia and Guangshuo Liu and Siddharth Garg
and Diana Marculescu",
title = "Thread Progress Equalization: Dynamically Adaptive
Power-Constrained Performance Optimization of
Multi-Threaded Applications",
journal = j-IEEE-TRANS-COMPUT,
volume = "66",
number = "4",
pages = "731--744",
month = "????",
year = "2017",
DOI = "https://doi.org/10.1109/TC.2016.2608951",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Sat Mar 11 14:24:09 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Robert Utterback and Kunal Agrawal and I-Ting Angelina
Lee and Milind Kulkarni",
title = "Processor-Oblivious Record and Replay",
journal = j-SIGPLAN,
volume = "52",
number = "8",
pages = "145--161",
month = aug,
year = "2017",
DOI = "https://doi.org/10.1145/3155284.3018764",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
ISSN-L = "0362-1340",
bibdate = "Fri Dec 1 18:56:12 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Record-and-replay systems are useful tools for
debugging non-deterministic parallel programs by first
recording an execution and then replaying that
execution to produce the same access pattern. Existing
record-and-replay systems generally target thread-based
execution models, and record the behaviors and
interleavings of individual threads. Dynamic
multithreaded languages and libraries, such as the Cilk
family, OpenMP, TBB, etc., do not have a notion of
threads. Instead, these languages provide a
processor-oblivious model of programming, where
programs expose task-parallelism using high-level
constructs such as spawn/sync without regard to the
number of threads/cores available to run the program.
Thread-based record-and-replay would violate the
processor-oblivious nature of these programs, as they
incorporate the number of threads into the recorded
information, constraining the replayed execution to the
same number of threads. In this paper, we present a
processor-oblivious record-and-replay scheme for such
languages where record and replay can use different
number of processors and both are scheduled using work
stealing. We provide theoretical guarantees for our
record and replay scheme --- namely that record is
optimal for programs with one lock and replay is
near-optimal for all cases. In addition, we implemented
this scheme in the Cilk Plus runtime system and our
evaluation indicates that processor-obliviousness does
not cause substantial overheads.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '17 conference proceedings.",
author = "Kaiyuan Wang and Sarfraz Khurshid and Milos Gligoric",
title = "{JPR}: Replaying {JPF} Traces Using Standard {JVM}",
journal = j-SIGSOFT,
volume = "42",
number = "4",
pages = "1--5",
month = oct,
year = "2017",
DOI = "https://doi.org/10.1145/3149485.3149494",
ISSN = "0163-5948 (print), 1943-5843 (electronic)",
ISSN-L = "0163-5948",
bibdate = "Wed Aug 1 17:16:48 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib",
abstract = "Java PathFinder (JPF) is a backtrackable Java Virtual
Machine (JVM), which is implemented in Java and runs on
a standard JVM (e.g., Oracle HotSpot). Thus, a JPF
developer can use off-the-shelf Java debuggers (e.g.,
jdb) when debugging code that makes up JPF. JPF
explores all non-deterministic executions of a given
target program and monitors for property violations. To
facilitate debugging of the target program, JPF can
capture and replay the execution trace that leads to a
property violation. While the deterministic replay is
invaluable, the replay with JPF does not allow the
developer to attach an off-the-shelf Java debugger to
the target program (e.g., step through the application
code, set breakpoints, etc.). We present a technique,
dubbed JPR, to improve the debugging experience of the
JPF captured traces by migrating the JPF traces to a
new format that can be executed using the standard JVM.
JPR annotates each JPF trace, during the capture phase,
with extra data (e.g., instruction index, instruction
count, etc.); the annotated trace is then used to
instrument Java bytecode to enforce the same execution
trace on a standard JVM. JPR is compatible with various
optimizations, e.g., state matching and partial-order
reduction. We evaluated JPR on all multithreaded Java
programs in the official JPF distribution. Our results
show that JPR successfully replayed all JPF traces on
the standard JVM with reasonable overhead during both
recording and replaying.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGSOFT Software Engineering Notes",
journal-URL = "https://dl.acm.org/citation.cfm?id=J728",
author = "Tsung Tai Yeh and Amit Sabne and Putt Sakdhnagool and
Rudolf Eigenmann and Timothy G. Rogers",
title = "{Pagoda}: Fine-Grained {GPU} Resource Virtualization
for Narrow Tasks",
journal = j-SIGPLAN,
volume = "52",
number = "8",
pages = "221--234",
month = aug,
year = "2017",
DOI = "https://doi.org/10.1145/3155284.3018754",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Fri Dec 1 18:56:12 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Massively multithreaded GPUs achieve high throughput
by running thousands of threads in parallel. To fully
utilize the hardware, workloads spawn work to the GPU
in bulk by launching large tasks, where each task is a
kernel that contains thousands of threads that occupy
the entire GPU. GPUs face severe underutilization and
their performance benefits vanish if the tasks are
narrow, i.e., they contain {$<$} 500 threads.
Latency-sensitive applications in network, signal, and
image processing that generate a large number of tasks
with relatively small inputs are examples of such
limited parallelism. This paper presents Pagoda, a
runtime system that virtualizes GPU resources, using an
OS-like daemon kernel called MasterKernel. Tasks are
spawned from the CPU onto Pagoda as they become
available, and are scheduled by the MasterKernel at the
warp granularity. Experimental results demonstrate that
Pagoda achieves a geometric mean speedup of 5.70x over
PThreads running on a 20-core CPU, 1.51x over
CUDA-HyperQ, and 1.69x over GeMTC, the
state-of-the-art runtime GPU task scheduling system.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PPoPP '17 conference proceedings.",
author = "Joel C. Adams and Patrick A. Crain and Christopher P.
Dilley and Christiaan D. Hazlett and Elizabeth R.
Koning and Serita M. Nelesen and Javin B. Unger and
Mark B. Vande Stel",
title = "{TSGL}: A tool for visualizing multithreaded
behavior",
journal = j-J-PAR-DIST-COMP,
volume = "118 (part 1)",
number = "??",
pages = "233--246",
month = aug,
year = "2018",
DOI = "https://doi.org/10.1016/j.jpdc.2018.02.025",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Sat May 12 16:27:31 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib",
URL = "https://www.sciencedirect.com/science/article/pii/S0743731518301035",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Laith M. AlBarakat and Paul V. Gratz and Daniel A.
Jim{\'e}nez",
title = "{MTB-Fetch}: Multithreading Aware Hardware Prefetching
for Chip Multiprocessors",
volume = "17",
number = "2",
pages = "175--178",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2847345",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "To fully exploit the scaling performance in Chip
Multiprocessors, applications must be divided into
semi-independent processes that can run concurrently on
multiple cores within a system. One major class of such
applications, shared-memory, multi-threaded
applications, requires programmers insert thread
synchronization primitives (i.e., locks, barriers, and
condition variables) in their critical sections to
synchronize data access between processes. For this
class of applications, scaling performance requires
balanced per-thread workloads with little time spent in
critical sections. In practice, however, threads often
waste significant time waiting to acquire
locks/barriers in their critical sections, leading to
thread imbalance and poor performance scaling.
Moreover, critical sections often stall data
prefetchers that mitigate the effects of long critical
section stalls by ensuring data is preloaded in the
core caches when the critical section is complete. In
this paper we examine a pure hardware technique to
enable safe data prefetching beyond synchronization
points in CMPs. We show that successful prefetching
beyond synchronization points requires overcoming two
significant challenges in existing prefetching
techniques. First, we find that typical data
prefetchers are designed to trigger prefetches based on
current misses. This approach works well for
traditional, continuously executing, single-threaded
applications. However, when a thread stalls on a
synchronization point, it typically does not produce
any new memory references to trigger a prefetcher.
Second, even in the event that a prefetch were to be
correctly directed to read beyond a synchronization
point, it will likely prefetch shared data from another
core before this data has been written. While this
prefetch would be considered ``accurate'' it is
highly undesirable, because such a prefetch would lead
to three extra ``ping-pong'' movements back and forth
between private caches in the producing and consuming
cores, incurring more latency and energy overhead than
without prefetching. We develop a new data prefetcher,
Multi-Thread B-Fetch (MTB-Fetch), built as an extension
to a previous single-threaded data prefetcher. MTB-Fetch
addresses both issues in prefetching for shared memory
multi-threaded workloads. MTB-Fetch achieves a speedup
of 9.3 percent for multi-threaded applications with
little additional hardware.",
acknowledgement = ack-nhfb,
affiliation = "AlBarakat, LM (Reprint Author), Texas A\&M Univ, Dept
Elect \& Comp Engn, College Stn, TX 77843 USA.
AlBarakat, Laith M.; Gratz, Paul, V, Texas A\&M Univ,
Dept Elect \& Comp Engn, College Stn, TX 77843 USA.
Jimenez, Daniel A., Texas A\&M Univ, Dept Comp Sci \&
Engn, College Stn, TX 77843 USA.",
author-email = "lalbarakat@tamu.edu pgratz@tamu.edu",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation
[I/UCRC-1439722, CCF-1649242, CCF-1216604/1332598];
Intel Corp.",
funding-text = "We thank the National Science Foundation, which
partially supported this work through grants
I/UCRC-1439722, CCF-1649242 and CCF-1216604/1332598 and
Intel Corp. for their generous support.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Chip multiprocessor; hardware prefetching;
multi-threading; shared memory",
keywords-plus = "PROCESSORS",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "AlBarakat:2018:MFM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
author = "Abdelhalim Amer and Huiwei Lu and Pavan Balaji and
Milind Chabbi and Yanjie Wei and Jeff Hammond and
Satoshi Matsuoka",
title = "Lock Contention Management in Multithreaded {MPI}",
journal = j-TOPC,
volume = "5",
number = "3",
pages = "12:1--12:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3275443",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:26 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3275443",
abstract = "In this article, we investigate contention management
in lock-based thread-safe MPI libraries. Specifically,
we make two assumptions: (1) locks are the only form of
synchronization when protecting communication paths;
and (2) contention occurs, and thus serialization is
unavoidable. Our work distinguishes between lock
acquisitions with respect to work being performed
inside a critical section; productive vs. unproductive.
Waiting for message reception without doing anything
else inside a critical section is an example of
unproductive lock acquisition. We show that the
high-throughput nature of modern scalable locking
protocols translates into better communication progress
for throughput-intensive MPI communication but
negatively impacts latency-sensitive communication
because of overzealous unproductive lock acquisition.
To reduce unproductive lock acquisitions, we devised a
method that promotes threads with productive work using
a generic two-level priority locking protocol. Our
results show that using a high-throughput protocol for
productive work and a fair protocol for less productive
code paths ensures the best tradeoff for fine-grained
communication, whereas a fair protocol is sufficient
for more coarse-grained communication. Although these
efforts have been rewarding, scalability degradation
remains significant. We discuss techniques that diverge
from the pure locking model and offer the potential to
further improve scalability.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
author = "Kuan-Chung Chen and Chung-Ho Chen",
title = "Enabling {SIMT} Execution Model on Homogeneous
Multi-Core System",
journal = j-TACO,
volume = "15",
number = "1",
pages = "6:1--6:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177960",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Single-instruction multiple-thread (SIMT) machine
emerges as a primary computing device in
high-performance computing, since the SIMT execution
paradigm can exploit data-level parallelism
effectively. This article explores the SIMT execution
potential on homogeneous multi-core processors, which
generally run in multiple-instruction multiple-data
(MIMD) mode when utilizing the multi-core resources. We
address three architecture issues in enabling SIMT
execution model on multi-core processor, including
multithreading execution model, kernel thread context
placement, and thread divergence. For the SIMT
execution model, we propose a fine-grained
multithreading mechanism on an ARM-based multi-core
system. Each of the processor cores stores the kernel
thread contexts in its L1 data cache for per-cycle
thread-switching requirement. For divergence-intensive
kernels, an Inner Conditional Statement First
(ICS-First) mechanism helps early re-convergence to
occur and significantly improves the performance. The
experiment results show that effectiveness in
data-parallel processing reduces on average 36\%
dynamic instructions, and boosts the SIMT executions to
achieve on average 1.52$ \times $ and up to 5$ \times $
speedups over the MIMD counterpart for OpenCL
benchmarks for single issue in-order processor cores.
By using the explicit vectorization optimization on the
kernels, the SIMT model gains further benefits from the
SIMD extension and achieves 1.71$ \times $ speedup over
the MIMD approach. The SIMT model using in-order
superscalar processor cores outperforms the MIMD model
that uses superscalar out-of-order processor cores by
40\%. The results show that, to exploit data-level
parallelism, enabling the SIMT model on homogeneous
multi-core processors is important.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Kuan-Hsun Chen and Georg von der Br{\"u}ggen and
Jian-Jia Chen",
title = "Reliability Optimization on Multi-Core Systems with
Multi-Tasking and Redundant Multi-Threading",
journal = j-IEEE-TRANS-COMPUT,
volume = "67",
number = "4",
pages = "484--497",
month = "????",
year = "2018",
DOI = "https://doi.org/10.1109/TC.2017.2769044",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Thu Mar 15 08:52:31 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib",
URL = "http://ieeexplore.ieee.org/document/8094023/",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Wei-Sheng Chin and Bo-Wen Yuan and Meng-Yuan Yang and
Chih-Jen Lin",
title = "An Efficient Alternating {Newton} Method for Learning
Factorization Machines",
journal = j-TIST,
volume = "9",
number = "6",
pages = "72:1--72:??",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3230710",
ISSN = "2157-6904 (print), 2157-6912 (electronic)",
ISSN-L = "2157-6904",
bibdate = "Thu Nov 15 16:23:08 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3230710",
abstract = "To date, factorization machines (FMs) have emerged as
a powerful model in many applications. In this work, we
study the training of FM with the logistic loss for
binary classification, which is a nonlinear extension
of the linear model with the logistic loss (i.e.,
logistic regression). For the training of large-scale
logistic regression, Newton methods have been shown to
be an effective approach, but it is difficult to apply
such methods to FM because of the nonconvexity. We
consider a modification of FM that is multiblock convex
and propose an alternating minimization algorithm based
on Newton methods. Some novel optimization techniques
are introduced to reduce the running time. Our
experiments demonstrate that the proposed algorithm is
more efficient than stochastic gradient algorithms and
coordinate descent methods. The parallelism of our
method is also investigated for the acceleration in
multithreading environments.",
acknowledgement = ack-nhfb,
articleno = "72",
fjournal = "ACM Transactions on Intelligent Systems and Technology
(TIST)",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1318",
author = "Enrico A. Deiana and Vincent St-Amour and Peter A.
Dinda and Nikos Hardavellas and Simone Campanoni",
title = "Unconventional Parallelization of Nondeterministic
Applications",
journal = j-SIGPLAN,
volume = "53",
number = "2",
pages = "432--447",
month = feb,
year = "2018",
DOI = "https://doi.org/10.1145/3296957.3173181",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Oct 16 14:12:56 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The demand for thread-level-parallelism (TLP) on
commodity processors is endless as it is essential for
gaining performance and saving energy. However, TLP in
today's programs is limited by dependences that must be
satisfied at run time. We have found that for
nondeterministic programs, some of these actual
dependences can be satisfied with alternative data that
can be generated in parallel, thus boosting the
program's TLP. Satisfying these dependences with
alternative data nonetheless produces final outputs
that match those of the original nondeterministic
program. To demonstrate the practicality of our
technique, we describe the design, implementation, and
evaluation of our compilers, autotuner, profiler, and
runtime, which are enabled by our proposed C++
programming language extensions. The resulting system
boosts the performance of six well-known
nondeterministic and multi-threaded benchmarks by
158.2\% (geometric mean) on a 28-core Intel-based
platform.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ASPLOS '18 proceedings.",
author = "Christian DeLozier and Ariel Eizenberg and Brandon
Lucia and Joseph Devietti",
title = "{SOFRITAS}: Serializable Ordering-Free Regions for
Increasing Thread Atomicity Scalably",
journal = j-SIGPLAN,
volume = "53",
number = "2",
pages = "286--300",
month = feb,
year = "2018",
DOI = "https://doi.org/10.1145/3296957.3173192",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Oct 16 14:12:56 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Correctly synchronizing multithreaded programs is
challenging and errors can lead to program failures
such as atomicity violations. Existing strong memory
consistency models rule out some possible failures, but
are limited by depending on programmer-defined locking
code. We present the new Ordering-Free Region (OFR)
serializability consistency model that ensures
atomicity for OFRs, which are spans of dynamic
instructions between consecutive ordering constructs
(e.g., barriers), without breaking atomicity at lock
operations. Our platform, Serializable Ordering-Free
Regions for Increasing Thread Atomicity Scalably
(SOFRITAS), ensures a C/C++ program's execution is
equivalent to a serialization of OFRs by default. We
build two systems that realize the SOFRITAS idea: a
concurrency bug finding tool for testing called
SOFRITEST, and a production runtime system called
SOPRO. SOFRITEST uses OFRs to find concurrency bugs,
including a multi-critical-section atomicity violation
in memcached that weaker consistency models will miss.
If OFRs are too coarse-grained, SOFRITEST suggests
refinement annotations automatically. Our software-only
SOPRO implementation has high performance, scales well
with increased parallelism, and prevents failures
despite bugs in locking code. SOFRITAS has an average
overhead of just 1.59x on a single-threaded execution
and 1.51x on sixteen threads, despite pthreads' much
weaker memory model.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ASPLOS '18 proceedings.",
author = "Sander {De Pestel} and Sam {Van den Steen} and Shoaib
Akram and Lieven Eeckhout",
title = "{RPPM}: Rapid Performance Prediction of Multithreaded
Applications on Multicore Hardware",
volume = "17",
number = "2",
pages = "183--186",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2849983",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper proposes RPPM which, based on a
microarchitecture-independent profile of a
multithreaded application, predicts its performance on
a previously unseen multicore platform. RPPM breaks up
multithreaded program execution into epochs based on
synchronization primitives, and then predicts per-epoch
active execution times for each thread and
synchronization overhead to arrive at a prediction for
overall application performance. RPPM predicts
performance within 12 percent on average (27 percent
max error) compared to cycle-level simulation. We
present a case study to illustrate that RPPM can be
used for making accurate multicore design trade-offs
early in the design cycle.",
acknowledgement = ack-nhfb,
affiliation = "De Pestel, S (Reprint Author), Univ Ghent, B-9000
Ghent, Belgium. De Pestel, Sander; Van den Steen, Sam;
Akram, Shoaib; Eeckhout, Lieven, Univ Ghent, B-9000
Ghent, Belgium.",
author-email = "sander.depestel@ugent.be sam.vandensteen@ugent.be
shoaib.akram@ugent.be lieven.eeckhout@ugent.be",
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Agency for Innovation by Science and
Technology in Flanders (IWT); European Research Council
(ERC) [741097]",
funding-text = "Sander De Pestel is supported through a doctoral
fellowship by the Agency for Innovation by Science and
Technology in Flanders (IWT). Additional support is
provided through the European Research Council (ERC)
Advanced Grant agreement no. 741097.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "micro-architecture; Modeling; multi-threaded",
number-of-cited-references = "12",
ORCID-numbers = "Van den Steen, Sam/0000-0003-3630-2214",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Pestel:2018:RRP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
author = "Mehmet Deveci and Christian Trott and Sivasankaran
Rajamanickam",
title = "Multithreaded sparse matrix--matrix multiplication for
many-core and {GPU} architectures",
volume = "78",
number = "??",
pages = "33--46",
month = oct,
year = "2018",
DOI = "https://doi.org/10.1016/j.parco.2018.06.009",
ISSN = "0167-8191 (print), 1872-7336 (electronic)",
ISSN-L = "0167-8191",
bibdate = "Mon Jan 7 15:25:20 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.sciencedirect.com/science/article/pii/S0167819118301923",
acknowledgement = ack-nhfb,
fjournal = "Parallel Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/01678191",
author = "Bailu Ding and Lucja Kot and Johannes Gehrke",
title = "Improving optimistic concurrency control through
transaction batching and operation reordering",
volume = "12",
number = "2",
pages = "169--182",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3282495.3282502",
ISSN = "2150-8097",
bibdate = "Wed Jan 2 18:29:48 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "OLTP systems can often improve throughput by batching
transactions and processing them as a group. Batching
has been used for optimizations such as message packing
and group commits; however, there is little research on
the benefits of a holistic approach to batching across
a transaction's entire life cycle. In this paper, we
present a framework to incorporate batching at multiple
stages of transaction execution for OLTP systems based
on optimistic concurrency control. Storage batching
enables reordering of transaction reads and writes at
the storage layer, reducing conflicts on the same
object. Validator batching enables reordering of
transactions before validation, reducing conflicts
between transactions. Dependencies between transactions
make transaction reordering a non-trivial problem, and
we propose several efficient and practical algorithms
that can be customized to various transaction
precedence policies such as reducing tail latency. We
also show how to reorder transactions with a
thread-aware policy in multi-threaded OLTP architecture
without a centralized validator. In-depth experiments
on a research prototype, an open-source OLTP system, and
a production OLTP system show that our techniques
increase transaction throughput by up to 2.2x and
reduce their tail latency by up to 71\% compared with
the state-of-the-art systems on workloads with high
data contention.",
acknowledgement = ack-nhfb,
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
author = "Joscha Drechsler and Ragnar Mogk and Guido Salvaneschi
and Mira Mezini",
title = "Thread-safe reactive programming",
journal = j-PACMPL,
volume = "2",
number = "OOPSLA",
pages = "107:1--107:30",
month = oct,
year = "2018",
DOI = "https://doi.org/10.1145/3276477",
bibdate = "Sat Aug 8 07:56:30 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3276477",
abstract = "The execution of an application written in a reactive
language involves transfer of data and control flow
between imperative and reactive abstractions at
well-defined points. In a multi-threaded environment,
multiple such interactions may execute \ldots{}",
acknowledgement = ack-nhfb,
articleno = "107",
fjournal = "Proceedings of the ACM on Programming Languages",
journal-URL = "https://pacmpl.acm.org/",
author = "Jordan Fix and Nayana P. Nagendra and Sotiris
Apostolakis and Hansen Zhang and Sophie Qiu and David
I. August",
title = "Hardware Multithreaded Transactions",
journal = j-SIGPLAN,
volume = "53",
number = "2",
pages = "15--29",
month = feb,
year = "2018",
DOI = "https://doi.org/10.1145/3296957.3173172",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
ISSN-L = "0362-1340",
bibdate = "Wed Oct 16 14:12:56 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Speculation with transactional memory systems helps
programmers and compilers produce profitable
thread-level parallel programs. Prior work shows that
supporting transactions that can span multiple threads,
rather than requiring transactions be contained within
a single thread, enables new types of speculative
parallelization techniques for both programmers and
parallelizing compilers. Unfortunately, software
support for multi-threaded transactions (MTXs) comes
with significant additional inter-thread communication
overhead for speculation validation. This overhead can
make otherwise good parallelization unprofitable for
programs with sizeable read and write sets. Some
programs using these prior software MTXs overcame this
problem through significant efforts by expert
programmers to minimize these sets and optimize
communication, capabilities which compiler technology
has been unable to equivalently achieve. Instead, this
paper makes speculative parallelization less laborious
and more feasible through low-overhead speculation
validation, presenting the first complete design,
implementation, and evaluation of hardware MTXs. Even
with maximal speculation validation of every load and
store inside transactions of tens to hundreds of
millions of instructions, profitable parallelization of
complex programs can be achieved. Across 8 benchmarks,
this system achieves a geomean speedup of 99\% over
sequential execution on a multicore machine with 4
cores.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "ASPLOS '18 proceedings.",
author = "Martti Forsell and Jussi Roivainen and Ville
Lepp{\"a}nen",
title = "{REPLICA MBTAC}: multithreaded dual-mode processor",
volume = "74",
number = "5",
pages = "1911--1933",
month = may,
year = "2018",
DOI = "https://doi.org/10.1007/s11227-017-2199-z",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Thu Oct 10 15:31:12 MDT 2019",
bibsource = "http://link.springer.com/journal/11227/74/5",
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Alexandros V. Gerbessiotis",
title = "A Study of Integer Sorting on Multicores",
volume = "28",
number = "04",
pages = "??--??",
month = dec,
year = "2018",
DOI = "https://doi.org/10.1142/S0129626418500147",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
ISSN-L = "0129-6264",
bibdate = "Mon Mar 29 12:30:05 MDT 2021",
bibsource = "http://ejournals.wspc.com.sg/ppl/",
URL = "https://www.worldscientific.com/doi/10.1142/S0129626418500147",
abstract = "Integer sorting on multicores and GPUs can be realized
by a variety of approaches that include variants of
distribution-based methods such as radix-sort,
comparison-oriented algorithms such as deterministic
regular sampling and random sampling parallel sorting,
and network-based algorithms such as Batcher's bitonic
sorting algorithm. In this work we present an
experimental study of integer sorting on multicore
processors. We have implemented serial and parallel
radix-sort for various radixes, deterministic regular
oversampling, and random oversampling parallel sorting,
including new variants of ours, and also some
previously little explored or unexplored variants of
bitonic-sort and odd-even transposition sort. The study
uses multithreading and multiprocessing parallel
programming libraries with the same C language code
working under Open MPI, MulticoreBSP, and BSPlib. We
first provide some general high-level observations on
the performance of these implementations. If we can
conclude anything, it is that accurate prediction of
performance by taking into consideration architecture
dependent features such as the structure and
characteristics of multiple memory hierarchies is
difficult and more often than not untenable. To some
degree this is affected by the overhead imposed by the
high-level library used in the programming effort.
Another objective is to model the performance of these
algorithms and their implementations under the MBSP
(Multi-memory BSP) model. Despite the limitations
mentioned above, we can still draw some reliable
conclusions and reason about the performance of these
implementations using the MBSP model, thus making MBSP
useful and usable.",
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "Ronghui Gu and Zhong Shao and Jieung Kim and Xiongnan
(Newman) Wu and J{\'e}r{\'e}mie Koenig and Vilhelm
Sj{\"o}berg and Hao Chen and David Costanzo and Tahina
title = "Certified concurrent abstraction layers",
journal = j-SIGPLAN,
volume = "53",
number = "4",
pages = "646--661",
month = apr,
year = "2018",
DOI = "https://doi.org/10.1145/3296979.3192381",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Oct 16 14:12:57 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Concurrent abstraction layers are ubiquitous in modern
computer systems because of the pervasiveness of
multithreaded programming and multicore hardware.
Abstraction layers are used to hide the implementation
details (e.g., fine-grained synchronization) and reduce
the complex dependencies among components at different
levels of abstraction. Despite their obvious
importance, concurrent abstraction layers have not been
treated formally. This severely limits the
applicability of layer-based techniques and makes it
difficult to scale verification across multiple
concurrent layers. In this paper, we present CCAL---a
fully mechanized programming toolkit developed under
the CertiKOS project---for specifying, composing,
compiling, and linking certified concurrent abstraction
layers. CCAL consists of three technical novelties: a
new game-theoretical, strategy-based compositional
semantic model for concurrency (and its associated
program verifiers), a set of formal linking theorems
for composing multithreaded and multicore concurrent
layers, and a new CompCertX compiler that supports
certified thread-safe compilation and linking. The CCAL
toolkit is implemented in Coq and supports layered
concurrent programming in both C and assembly. It has
been successfully applied to build a fully certified
concurrent OS kernel with fine-grained locking.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '18 proceedings.",
author = "Saurabh Hukerikar and Keita Teranishi and Pedro C.
Diniz and Robert F. Lucas",
title = "{RedThreads}: An Interface for Application-Level Fault
Detection\slash Correction Through Adaptive Redundant
journal = j-INT-J-PARALLEL-PROG,
volume = "46",
number = "2",
pages = "225--251",
month = apr,
year = "2018",
DOI = "https://doi.org/10.1007/s10766-017-0492-3",
ISSN = "0885-7458 (print), 1573-7640 (electronic)",
ISSN-L = "0885-7458",
bibdate = "Fri Oct 11 08:37:50 MDT 2019",
bibsource = "http://link.springer.com/journal/10766/46/2;
acknowledgement = ack-nhfb,
fjournal = "International Journal of Parallel Programming",
journal-URL = "http://link.springer.com/journal/10766",
author = "Konstantinos Iliakis and Sotirios Xydis and Dimitrios
title = "Decoupled {MapReduce} for Shared-Memory Multi-Core
volume = "17",
number = "2",
pages = "143--146",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2827929",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Modern multi-core processors exhibit high integration
densities, e.g., up to several tens of cores. Multiple
programming frameworks have emerged to facilitate the
development of highly parallel applications. The
MapReduce programming model, after having demonstrated
its usability in the area of distributed computing
systems, has been adapted to the needs of shared-memory
multi-processors showing promising results in
comparison with conventional multi-threaded libraries,
e.g., pthreads. In this paper we enhance the
traditional MapReduce architecture by decoupling the
map and combine phases in order to boost parallel
execution. We show that combiners' memory intensive
features limit the system's degree of parallelism, thus
resulting in sub-optimal hardware utilization, leaving
space for further performance improvements. The
proposed decoupled MapReduce architecture is evaluated
into a NUMA server platform, showing that the adoption
of the De-MapR runtime enables more efficient hardware
utilization and competent run-time improvements. We
demonstrate that the proposed solution achieves
execution speedups of up to 2.46x compared to a
state-of-the-art, shared-memory MapReduce library.",
acknowledgement = ack-nhfb,
affiliation = "Iliakis, K (Reprint Author), Natl Tech Univ Athens,
Zografos 15780, Greece. Iliakis, Konstantinos; Xydis,
Sotirios; Soudris, Dimitrios, Natl Tech Univ Athens,
Zografos 15780, Greece.",
author-email = "konstantinos.iliakis@cern.ch sxydis@microlab.ntua.gr
da = "2019-06-20",
doc-delivery-number = "GP4TI",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "MapReduce; multi-cores; runtime systems",
number-of-cited-references = "13",
ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847",
research-areas = "Computer Science",
researcherid-numbers = "Soudris, Dimitrios/O-8843-2019",
times-cited = "0",
unique-id = "Iliakis:2018:DMS",
web-of-science-categories = "Computer Science, Hardware \&
author = "Bart Jacobs and Dragan Bosnacki and Ruurd Kuiper",
title = "Modular Termination Verification of Single-Threaded
and Multithreaded Programs",
journal = j-TOPLAS,
volume = "40",
number = "3",
pages = "12:1--12:??",
month = aug,
year = "2018",
DOI = "https://doi.org/10.1145/3210258",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Thu Oct 18 12:01:50 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "We propose an approach for the modular specification
and verification of total correctness properties of
object-oriented programs. The core of our approach is a
specification style that prescribes a way to assign a
level expression to each method such that each callee's
level is below the caller's, even in the presence of
dynamic binding. The specification style yields
specifications that properly hide implementation
details. The main idea is to use multisets of method
names as levels, and to associate with each object
levels that abstractly reflect the way the object is
built from other objects. A method's level is then
defined in terms of the method's own name and the
levels associated with the objects passed as arguments.
We first present the specification style in the context
of programs that do not modify object fields. We then
combine it with separation logic and abstract predicate
families to obtain an approach for programs with heap
mutation. In a third step, we address concurrency, by
incorporating an existing approach for verifying
deadlock freedom of channels and locks. Our main
contribution here is to achieve information hiding by
using the proposed termination levels for lock ordering
as well. Also, we introduce call permissions to enable
elegant verification of termination of programs where
threads cause work in other threads, such as in thread
pools or fine-grained concurrent algorithms involving
compare-and-swap loops. We explain how our approach can
be used also to verify the liveness of nonterminating
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Programming Languages and
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Kari K{\"a}hk{\"o}nen and Keijo Heljanko",
title = "Testing Programs with Contextual Unfoldings",
journal = j-TECS,
volume = "17",
number = "1",
pages = "23:1--23:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/2810000",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Oct 17 18:16:34 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "In this article, we present a new algorithm that
combines contextual unfoldings and dynamic symbolic
execution to systematically test multithreaded
programs. The approach uses symbolic execution to limit
the number of input values and unfoldings to thus limit
the number of thread interleavings that are needed to
cover reachable local states of threads in the program
under test. We show that the use of contextual
unfoldings allows interleavings of threads to be
succinctly represented. This can in some cases lead to
a substantial reduction in the number of needed test
executions when compared to previous approaches.",
acknowledgement = ack-nhfb,
articleno = "23",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840",
author = "Orhan Kislal and Jagadish Kotra and Xulong Tang and
Mahmut Taylan Kandemir and Myoungsoo Jung",
title = "Enhancing computation-to-core assignment with physical
location information",
journal = j-SIGPLAN,
volume = "53",
number = "4",
pages = "312--327",
month = apr,
year = "2018",
DOI = "https://doi.org/10.1145/3296979.3192386",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Oct 16 14:12:57 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Going beyond a certain number of cores in modern
architectures requires an on-chip network more scalable
than conventional buses. However, employing an on-chip
network in a manycore system (to improve scalability)
makes the latencies of the data accesses issued by a
core non-uniform. This non-uniformity can play a
significant role in shaping the overall application
performance. This work presents a novel compiler
strategy which involves exposing architecture
information to the compiler to enable an optimized
computation-to-core mapping. Specifically, we propose a
compiler-guided scheme that takes into account the
relative positions of (and distances between) cores,
last-level caches (LLCs) and memory controllers (MCs)
in a manycore system, and generates a mapping of
computations to cores with the goal of minimizing the
on-chip network traffic. The experimental data
collected using a set of 21 multi-threaded applications
reveal that, on an average, our approach reduces the
on-chip network latency in a 6$ \times $6 manycore
system by 38.4\% in the case of private LLCs, and
43.8\% in the case of shared LLCs. These improvements
translate to the corresponding execution time
improvements of 10.9\% and 12.7\% for the private LLC
and shared LLC based systems, respectively.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '18 proceedings.",
author = "Sushant Kondguli and Michael Huang",
title = "{Bootstrapping}: Using {SMT} Hardware to Improve
Single-Thread Performance",
volume = "17",
number = "2",
pages = "205--208",
month = jul # "\slash " # dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2018.2859945",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Decoupled look-ahead (DLA) architectures have been
shown to be an effective way to improve single-thread
performance. However, a default implementation requires
an additional core. While an SMT flavor is possible, a
naive implementation is inefficient and thus slow. In
this paper, we propose an optimized implementation
called Bootstrapping that makes DLA just as effective
on a single (SMT) core as using two cores. While fusing
two cores can improve single-thread performance by
1.23x, Bootstrapping provides a speedup of 1.51.",
acknowledgement = ack-nhfb,
affiliation = "Kondguli, S (Reprint Author), Univ Rochester, Dept
Elect \& Comp Engn, Rochester, NY 14627 USA. Kondguli,
Sushant; Huang, Michael, Univ Rochester, Dept Elect \&
Comp Engn, Rochester, NY 14627 USA.",
author-email = "sushant.kondguli@rochester.edu
da = "2019-06-20",
doc-delivery-number = "HA2CO",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [1514433, 1533842]",
funding-text = "This work is supported in part by NSF under grants
1514433 and 1533842.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Decoupled look-ahead (DLA) architectures; simultaneous
multi-threading (SMT); single thread performance",
number-of-cited-references = "20",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Kondguli:2018:BUS",
web-of-science-categories = "Computer Science, Hardware \&
author = "I-Ting Angelina Lee and Tao B. Schardl",
title = "Efficient Race Detection for Reducer Hyperobjects",
journal = j-TOPC,
volume = "4",
number = "4",
pages = "20:1--20:??",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3205914",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:25 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "A multithreaded Cilk program that is ostensibly
deterministic may nevertheless behave
nondeterministically due to programming errors in the
code. For a Cilk program that uses reducers---a general
reduction mechanism supported in various Cilk
dialects---such programming errors are especially
challenging to debug, because the errors can expose the
nondeterminism in how the Cilk runtime system manages
reducers. We identify two unique types of races that
arise from incorrect use of reducers in a Cilk program,
and we present two algorithms to catch these races. The
first algorithm, called the Peer-Set algorithm, detects
view-read races, which occur when the program attempts
to retrieve a value out of a reducer when the read may
result in a nondeterministic value, such as before all
previously spawned subcomputations that might update
the reducer have necessarily returned. The second
algorithm, called the SP+ algorithm, detects
determinacy races---instances where a write to a memory
location occurs logically in parallel with another
access to that location---even when the raced-on memory
locations relate to reducers. Both algorithms are
provably correct, asymptotically efficient, and can be
implemented efficiently in practice. We have
implemented both algorithms in our prototype race
detector, Rader. When running Peer-Set, Rader incurs a
geometric-mean multiplicative overhead of 2.56 over
running the benchmark without instrumentation. When
running SP+, Rader incurs a geometric-mean
multiplicative overhead of 16.94.",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
author = "Hongyu Liu and Sam Silvestro and Wei Wang and Chen
Tian and Tongping Liu",
title = "{iReplayer}: in-situ and identical record-and-replay
for multithreaded applications",
journal = j-SIGPLAN,
volume = "53",
number = "4",
pages = "344--358",
month = apr,
year = "2018",
DOI = "https://doi.org/10.1145/3296979.3192380",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Wed Oct 16 14:12:57 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Reproducing executions of multithreaded programs is
very challenging due to many intrinsic and external
non-deterministic factors. Existing RnR systems achieve
significant progress in terms of performance overhead,
but none targets the in-situ setting, in which replay
occurs within the same process as the recording
process. Also, most existing work cannot achieve
identical replay, which may prevent the reproduction of
some errors. This paper presents iReplayer, which aims
to identically replay multithreaded programs in the
original process (under the ``in-situ'' setting). The
novel in-situ and identical replay of iReplayer makes
it more likely to reproduce errors, and allows it to
directly employ debugging mechanisms (e.g. watchpoints)
to aid failure diagnosis. Currently, iReplayer only
incurs 3\% performance overhead on average, which
allows it to be always enabled in the production
environment. iReplayer enables a range of
possibilities, and this paper presents three examples:
two automatic tools for detecting buffer overflows and
use-after-free bugs, and one interactive debugging tool
that is integrated with GDB.",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGPLAN Notices",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706",
remark = "PLDI '18 proceedings.",
author = "Andreas Lochbihler",
title = "Mechanising a Type-Safe Model of Multithreaded {Java}
with a Verified Compiler",
journal = j-J-AUTOM-REASON,
volume = "61",
number = "1--4",
pages = "243--332",
month = jun,
year = "2018",
DOI = "https://doi.org/10.1007/s10817-018-9452-x",
ISSN = "0168-7433 (print), 1573-0670 (electronic)",
ISSN-L = "0168-7433",
bibdate = "Sat Aug 4 07:51:41 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jautomreason.bib;
URL = "http://link.springer.com/article/10.1007/s10817-018-9452-x",
acknowledgement = ack-nhfb,
fjournal = "Journal of Automated Reasoning",
journal-URL = "http://link.springer.com/journal/10817",
author = "Majdi Maabreh and Hafez Irshid and Ajay Gupta and
Izzat Alasmadi",
title = "A multithreading and hashing technique for indexing
{Target--Decoy} peptides databases",
journal = j-CCPE,
volume = "30",
number = "9",
pages = "??--??",
day = "10",
month = may,
year = "2018",
DOI = "https://doi.org/10.1002/cpe.4371",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Sat Aug 4 10:03:13 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
URL = "https://onlinelibrary.wiley.com/doi/abs/10.1002/cpe.4371",
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Practice and Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
author = "Anton Malakhov and David Liu and Anton Gorshkov and
Terry Wilmarth",
editor = "Fatih Akici and David Lippa and Dillon Niederhut and M
booktitle = "Proceedings of the {17th Python in Science Conference,
Austin, TX, 9--15 July 2018}",
title = "Composable Multi-Threading and Multi-Processing for
Numeric Libraries",
publisher = "????",
address = "????",
pages = "15--21",
year = "2018",
bibdate = "Wed Aug 1 09:03:36 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://conference.scipy.org/proceedings/scipy2018/anton_malakhov.html",
abstract = "Python is popular among scientific communities that
value its simplicity and power, especially as it comes
along with numeric libraries such as NumPy, SciPy,
Dask, and Numba. As CPU core counts keep increasing,
these modules can make use of many cores via
multi-threading for efficient multi-core parallelism.
However, threads can interfere with each other leading
to overhead and inefficiency if used together in a
single application on machines with a large number of
cores. This performance loss can be prevented if all
multi-threaded modules are coordinated. This paper
continues the work started in AMala16 by introducing
more approaches to coordination for both
multi-threading and multi-processing cases. In
particular, we investigate the use of static settings,
limiting the number of simultaneously active OpenMP
parallel regions, and optional parallelism with Intel
Threading Building Blocks (Intel TBB). We will show how
these approaches help to unlock additional performance
for numeric applications on multi-core systems.",
acknowledgement = ack-nhfb,
keywords = "Dask; GIL; Joblib; Multi-core; Multi-processing;
Multi-threading; Nested Parallelism; NumPy; OpenMP;
Oversubscription; Parallel Computations; Python; SciPy;
author = "Stefan K. Muller and Umut A. Acar and Robert Harper",
title = "Competitive parallelism: getting your priorities
journal = j-PACMPL,
volume = "2",
number = "ICFP",
pages = "95:1--95:30",
month = jul,
year = "2018",
DOI = "https://doi.org/10.1145/3236790",
bibdate = "Fri Aug 7 17:44:42 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/abs/10.1145/3236790",
abstract = "Multi-threaded programs have traditionally fallen into
one of two domains: cooperative and competitive. These
two domains have traditionally remained mostly
disjoint, with cooperative threading used for
increasing throughput in compute-intensive \ldots{}",
acknowledgement = ack-nhfb,
articleno = "95",
fjournal = "Proceedings of the ACM on Programming Languages",
journal-URL = "https://pacmpl.acm.org/",
author = "Binh Pham and Derek Hower and Abhishek Bhattacharjee
and Trey Cain",
title = "{TLB} Shootdown Mitigation for Low-Power Many-Core
Servers with {L1} Virtual Caches",
volume = "17",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2712140",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Power efficiency has become one of the most important
design constraints for high-performance systems. In
this paper, we revisit the design of low-power
virtually-addressed caches. While virtually-addressed
caches enable significant power savings by obviating
the need for Translation Lookaside Buffer (TLB)
lookups, they suffer from several challenging design
issues that curtail their widespread commercial
adoption. We focus on one of these challenges---cache
flushes due to virtual page remappings. We use detailed
studies on an ARM many-core server to show that this
problem degrades performance by up to 25 percent for a
mix of multi-programmed and multi-threaded workloads.
Interestingly, we observe that many of these flushes
are spurious, and caused by an indiscriminate
invalidation broadcast on ARM architecture. In
response, we propose a low-overhead and readily
implementable hardware mechanism using bloom filters to
reduce spurious invalidations and mitigate their ill
acknowledgement = ack-nhfb,
affiliation = "Pham, B (Reprint Author), Rutgers State Univ, Dept
Comp Sci, Piscataway, NJ 08854 USA. Binh Pham;
Bhattacharjee, Abhishek, Rutgers State Univ, Dept Comp
Sci, Piscataway, NJ 08854 USA. Hower, Derek, Qualcomm
Technol Inc, Piscataway, NJ 08854 USA. Cain, Trey,
Qualcomm Datactr Technol Inc, Piscataway, NJ 08854
author-email = "binhpham@rutgers.edu dhower@qti.qualcomm.com
abhib@rutgers.edu tcain@qti.qualcomm.com",
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "multicores; multiprogramming; multithreading; TLB;
Virtual Cache; virtual memory",
number-of-cited-references = "21",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Pham:2018:TSM",
web-of-science-categories = "Computer Science, Hardware \&
author = "Dawid Polap and Marcin Wo{\'z}niak and Wei Wei and
Robertas Damasevicius",
title = "Multi-threaded learning control mechanism for neural
journal = j-FUT-GEN-COMP-SYS,
volume = "87",
number = "??",
pages = "16--34",
month = oct,
year = "2018",
ISSN = "0167-739X (print), 1872-7115 (electronic)",
ISSN-L = "0167-739X",
bibdate = "Tue Jun 26 08:47:57 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
URL = "https://www.sciencedirect.com/science/article/pii/S0167739X18300931",
acknowledgement = ack-nhfb,
fjournal = "Future Generation Computer Systems",
journal-URL = "http://www.sciencedirect.com/science/journal/0167739X",
author = "Malcolm Roberts and John C. Bowman",
title = "Multithreaded implicitly dealiased convolutions",
journal = j-J-COMPUT-PHYS,
volume = "356",
number = "??",
pages = "98--114",
day = "1",
month = mar,
year = "2018",
ISSN = "0021-9991 (print), 1090-2716 (electronic)",
ISSN-L = "0021-9991",
bibdate = "Sat Jan 13 12:33:11 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jcomputphys2015.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0021999117308641",
acknowledgement = ack-nhfb,
fjournal = "Journal of Computational Physics",
journal-URL = "http://www.sciencedirect.com/science/journal/00219991",
author = "Semih Sahin and Bugra Gedik",
title = "{C-Stream}: a Co-routine-Based Elastic Stream
Processing Engine",
journal = j-TOPC,
volume = "4",
number = "3",
pages = "15:1--15:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3184120",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:25 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Stream processing is a computational paradigm for
on-the-fly processing of live data. This paradigm lends
itself to implementations that can provide high
throughput and low latency by taking advantage of
various forms of parallelism that are naturally
captured by the stream processing model of computation,
such as pipeline, task, and data parallelism. In this
article, we describe the design and implementation of
C-Stream, which is an elastic stream processing engine.
C-Stream encompasses three unique properties. First, in
contrast to the widely adopted event-based interface
for developing streaming operators, C-Stream provides
an interface wherein each operator has its own driver
loop and relies on data availability application
programming interfaces (APIs) to decide when to perform
its computations. This self-control-based model
significantly simplifies the development of operators
that require multiport synchronization. Second,
C-Stream contains a dynamic scheduler that manages the
multithreaded execution of the operators. The
scheduler, which is customizable via plug-ins, enables
the execution of the operators as co-routines, using
any number of threads. The base scheduler implements
back-pressure, provides data availability APIs, and
manages preemption and termination handling. Last,
C-Stream varies the degree of parallelism to resolve
bottlenecks by both dynamically changing the number of
threads used to execute an application and adjusting
the number of replicas of data-parallel operators. We
provide an experimental evaluation of C-Stream. The
results show that C-Stream is scalable, highly
customizable, and can resolve bottlenecks by
dynamically adjusting the level of data parallelism
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
author = "Karthik Sangaiah and Michael Lui and Radhika Jagtap
and Stephan Diestelhorst and Siddharth Nilakantan and
Ankit More and Baris Taskin and Mark Hempstead",
title = "{SynchroTrace}: Synchronization-Aware
Architecture-Agnostic Traces for Lightweight Multicore
Simulation of {CMP} and {HPC} Workloads",
journal = j-TACO,
volume = "15",
number = "1",
pages = "2:1--2:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3158642",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:19:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Trace-driven simulation of chip multiprocessor (CMP)
systems offers many advantages over execution-driven
simulation, such as reducing simulation time and
complexity, allowing portability, and scalability.
However, trace-based simulation approaches have
difficulty capturing and accurately replaying
multithreaded traces due to the inherent nondeterminism
in the execution of multithreaded programs. In this
work, we present SynchroTrace, a scalable, flexible,
and accurate trace-based multithreaded simulation
methodology. By recording synchronization events
relevant to modern threading libraries (e.g., Pthreads
and OpenMP) and dependencies in the traces, independent
of the host architecture, the methodology is able to
accurately model the nondeterminism of multithreaded
programs for different hardware platforms and threading
paradigms. Through capturing high-level instruction
categories, the SynchroTrace average CPI trace Replay
timing model offers fast and accurate simulation of
many-core in-order CMPs. We perform two case studies to
validate the SynchroTrace simulation flow against the
gem5 full-system simulator: (1) a constraint-based
design space exploration with traditional CMP
benchmarks and (2) a thread-scalability study with
HPC-representative applications. The results from these
case studies show that (1) our trace-based approach
with trace filtering has a peak speedup of up to 18.7$
\times $ over simulation in gem5 full-system with an
average of 9.6$ \times $ speedup, (2) SynchroTrace
maintains the thread-scaling accuracy of gem5 and can
efficiently scale up to 64 threads, and (3)
SynchroTrace can trace in one platform and model any
platform in early stages of design.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Christian Schmitt and Moritz Schmid and Sebastian
Kuckuk and Harald K{\"o}stler and J{\"u}rgen Teich and
Frank Hannig",
title = "Reconfigurable Hardware Generation of Multigrid
Solvers with Conjugate Gradient Coarse-Grid Solution",
volume = "28",
number = "04",
pages = "??--??",
month = dec,
year = "2018",
DOI = "https://doi.org/10.1142/S0129626418500160",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
ISSN-L = "0129-6264",
bibdate = "Mon Mar 29 12:30:05 MDT 2021",
bibsource = "http://ejournals.wspc.com.sg/ppl/;
URL = "https://www.worldscientific.com/doi/10.1142/S0129626418500160",
abstract = "Not only in the field of high-performance computing
(HPC), field programmable gate arrays (FPGAs) are a
soaringly popular accelerator technology. However, they
use a completely different programming paradigm and
tool set compared to central processing units (CPUs) or
even graphics processing units (GPUs), adding extra
development steps and requiring special knowledge,
hindering widespread use in scientific computing. To
bridge this programmability gap, domain-specific
languages (DSLs) are a popular choice to generate
low-level implementations from an abstract algorithm
description. In this work, we demonstrate our approach
for the generation of numerical solver implementations
based on the multigrid method for FPGAs from the same
code base that is also used to generate code for CPUs
using a hybrid parallelization of MPI and OpenMP. Our
approach yields in a hardware design that can compute
up to 11 V-cycles per second with an input grid size of
4096 {\texttimes} 4096 and
solution on the coarsest using the conjugate gradient
(CG) method on a mid-range FPGA, beating vectorized,
multi-threaded execution on an Intel Xeon processor.",
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "Alberto Scionti and Somnath Mazumdar and Stephane
Zuckerman",
title = "Enabling Massive Multi-Threading with Fast Hashing",
volume = "17",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2017.2697863",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
abstract = "The next generation of high-performance computers is
expected to execute threads in orders of magnitude
higher than today's systems. Improper management of
such huge amount of threads can create resource
contention, leading to overall degraded system
performance. By leveraging more practical approaches to
distribute threads on the available resources,
execution models and manycore chips are expected to
overcome limitations of current systems. Here, we
present DELTA --- a Data-Enabled muLti-Threaded
Architecture, where a producer-consumer scheme is used
to execute threads via complete distributed thread
management mechanism. We consider a manycore tiled-chip
architecture where Network-on-Chip (NoC) routers are
extended to support our execution model. The proposed
extension is analysed, while simulation results confirm
that DELTA can manage a large number of simultaneous
threads, relying on a simple hardware structure.",
acknowledgement = ack-nhfb,
affiliation = "Scionti, A (Reprint Author), ISMB, I-10138 Turin,
Italy. Scionti, Alberto, ISMB, I-10138 Turin, Italy.
Mazumdar, Somnath, Univ Siena, Siena, SI, Italy.
Zuckerman, Stephane, Michigan Technol Univ, Houghton,
MI 49931 USA.",
author-email = "scionti@ismb.it mazumdar@dii.unisi.it
da = "2019-06-20",
doc-delivery-number = "FZ6EO",
eissn = "1556-6064",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Dataflow; hashing; network-on-chip;
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Scionti:2018:EMM",
web-of-science-categories = "Computer Science, Hardware \&
author = "Xulong Tang and Mahmut Taylan Kandemir and Hui Zhao
and Myoungsoo Jung and Mustafa Karakoy",
title = "Computing with Near Data",
journal = j-POMACS,
volume = "2",
number = "3",
pages = "42:1--42:30",
month = dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3287321",
ISSN = "2476-1249",
ISSN-L = "2476-1249",
bibdate = "Mon Mar 29 10:31:29 MDT 2021",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/pomacs.bib;
URL = "https://dl.acm.org/doi/10.1145/3287321",
abstract = "One cost that plays a significant role in shaping the
overall performance of both single-threaded and
multi-thread applications in modern computing systems
is the cost of moving data between compute elements and
storage elements. Traditional approaches \ldots{}",
acknowledgement = ack-nhfb,
articleno = "42",
fjournal = "Proceedings of the ACM on Measurement and Analysis of
Computing Systems (POMACS)",
journal-URL = "https://dl.acm.org/loi/pomacs",
author = "Lo{\"\i}c Th{\'e}bault and Eric Petit",
title = "Asynchronous and multithreaded communications on
irregular applications using vectorized divide and
conquer approach",
journal = j-J-PAR-DIST-COMP,
volume = "114",
number = "??",
pages = "16--27",
month = apr,
year = "2018",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Tue Feb 6 13:52:05 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0743731517303350",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Zhenzhou Tian and Ting Liu and Qinghua Zheng and Eryue
Zhuang and Ming Fan and Zijiang Yang",
title = "Reviving Sequential Program Birthmarking for
Multithreaded Software Plagiarism Detection",
volume = "44",
number = "5",
pages = "491--511",
month = may,
year = "2018",
DOI = "https://doi.org/10.1109/TSE.2017.2688383",
ISSN = "0098-5589 (print), 1939-3520 (electronic)",
ISSN-L = "0098-5589",
bibdate = "Thu Jun 14 08:43:22 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib;
URL = "https://ieeexplore.ieee.org/document/7888597/",
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Software Engineering",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
author = "Brian L. Troutwine",
title = "Hands-on Concurrency with {Rust}: Confidently Build
Memory-safe, Parallel, and Efficient Software in
{Rust}",
publisher = pub-PACKT,
address = pub-PACKT:adr,
pages = "v + 449",
year = "2018",
ISBN = "1-78839-997-8 (paperback), 1-78847-835-5",
ISBN-13 = "978-1-78839-997-5 (paperback), 978-1-78847-835-9",
LCCN = "QA76.76.A65",
bibdate = "Tue Dec 10 05:53:29 MST 2019",
bibsource = "fsz3950.oclc.org:210/WorldCat;
URL = "http://proquest.safaribooksonline.com/?fpi=9781788399975",
abstract = "Get to grips with modern software demands by learning
the effective uses of Rust's powerful memory safety. Key
Features Learn and improve the sequential performance
characteristics of your software Understand the use of
operating system processes in a high-scale concurrent
system Learn of the various coordination methods
available in the Standard library. Most programming
languages can really complicate things, especially with
regard to unsafe memory access. The burden on you, the
programmer, lies across two domains: understanding the
modern machine and your language's pain-points. This
book will teach you how to manage program
performance on modern machines and build fast,
memory-safe, and concurrent software in Rust. It starts
with the fundamentals of Rust and discusses machine
architecture concepts. You will be taken through ways
to measure and improve the performance of Rust code
systematically and how to write collections with
confidence. You will learn about the Sync and Send
traits applied to threads, and coordinate thread
execution with locks, atomic primitives,
data-parallelism, and more. The book will show you how
to efficiently embed Rust in C++ code and explore the
functionalities of various crates for multithreaded
applications. It explores implementations in depth. You
will know how a mutex works and build several yourself.
You will master radically different approaches that
exist in the ecosystem for structuring and managing
high-scale systems. By the end of the book, you will
feel comfortable with designing safe, consistent,
parallel, and high-performance applications in
Rust. What you will learn Probe your programs for
performance and accuracy issues Create your own
threading and multi-processing environment in Rust Use
coarse locks from Rust's Standard library Solve common
synchronization problems or avoid synchronization using
atomic programming Build lock-free/wait-free structures
in Rust and understand their implementations in the
crates ecosystem Leverage Rust's memory model and type
system to build safety properties into your parallel
programs Understand the new features of the Rust
programming language to ease the writing of parallel
programs. Who this book is for. This book is aimed at
software engineers with a basic understanding of Rust
who want to exploit the parallel and concurrent nature
of modern computing environments, safely.",
acknowledgement = ack-nhfb,
libnote = "Not in my library.",
subject = "Application software; Development; Computer
multitasking; Programming languages (Electronic
computers); Portable and handheld devices:
consumer/user guides; Mobile phones: consumer/user
guides; Parallel processing; Programming and scripting
languages: general; Computers; Programming; Parallel;
Hardware; Handheld Devices; Programming Languages; C;
Development; Computer multitasking; Programming
languages (Electronic computers)",
author = "Jui-Hsien Wang and Ante Qu and Timothy R. Langlois and
Doug L. James",
title = "Toward wave-based sound synthesis for computer
animation",
journal = j-TOG,
volume = "37",
number = "4",
pages = "109:1--109:??",
month = aug,
year = "2018",
DOI = "https://doi.org/10.1145/3197517.3201318",
ISSN = "0730-0301 (print), 1557-7368 (electronic)",
ISSN-L = "0730-0301",
bibdate = "Thu Nov 29 17:19:43 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "We explore an integrated approach to sound generation
that supports a wide variety of physics-based
simulation models and computer-animated phenomena.
Targeting high-quality offline sound synthesis, we seek
to resolve animation-driven sound radiation with
near-field scattering and diffraction effects. The core
of our approach is a sharp-interface finite-difference
time-domain (FDTD) wavesolver, with a series of
supporting algorithms to handle rapidly deforming and
vibrating embedded interfaces arising in physics-based
animation sound. Once the solver rasterizes these
interfaces, it must evaluate acceleration boundary
conditions (BCs) that involve model- and
phenomena-specific computations. We introduce acoustic
shaders as a mechanism to abstract away these
complexities, and describe a variety of implementations
for computer animation: near-rigid objects with ringing
and acceleration noise, deformable (finite element)
models such as thin shells, bubble-based water, and
virtual characters. Since time-domain wave synthesis is
expensive, we only simulate pressure waves in a small
region about each sound source, then estimate a
far-field pressure signal. To further improve
scalability beyond multi-threading, we propose a fully
time-parallel sound synthesis method that is
demonstrated on commodity cloud computing resources. In
addition to presenting results for multiple animation
phenomena (water, rigid, shells, kinematic deformers,
etc.) we also propose 3D automatic dialogue replacement
(3DADR) for virtual characters so that pre-recorded
dialogue can include character movement, and near-field
shadowing and scattering sound effects.",
acknowledgement = ack-nhfb,
articleno = "109",
fjournal = "ACM Transactions on Graphics",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J778",
author = "Parosh Aziz Abdulla and Mohamed Faouzi Atig and Bengt
Jonsson and Magnus L{\aa}ng and Tuan Phong Ngo and
Konstantinos Sagonas",
title = "Optimal stateless model checking for reads-from
equivalence under sequential consistency",
journal = j-PACMPL,
volume = "3",
number = "OOPSLA",
pages = "150:1--150:29",
month = oct,
year = "2019",
DOI = "https://doi.org/10.1145/3360576",
bibdate = "Fri Aug 7 19:22:30 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/abs/10.1145/3360576",
abstract = "We present a new approach for stateless model checking
(SMC) of multithreaded programs under Sequential
Consistency (SC) semantics. To combat state-space
explosion, SMC is often equipped with a partial-order
reduction technique, which defines an \ldots{}",
acknowledgement = ack-nhfb,
articleno = "150",
fjournal = "Proceedings of the ACM on Programming Languages",
journal-URL = "https://pacmpl.acm.org/",
author = "Patrick R. Amestoy and Alfredo Buttari and Jean-Yves
L'Excellent and Theo Mary",
title = "Performance and Scalability of the Block Low-Rank
Multifrontal Factorization on Multicore Architectures",
journal = j-TOMS,
volume = "45",
number = "1",
pages = "2:1--2:26",
month = mar,
year = "2019",
DOI = "https://doi.org/10.1145/3242094",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Mon May 6 18:23:42 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/citation.cfm?id=3242094",
abstract = "Matrices coming from elliptic Partial Differential
Equations have been shown to have a low-rank property
that can be efficiently exploited in multifrontal
solvers to provide a substantial reduction of their
complexity. Among the possible low-rank formats, the
Block Low-Rank format (BLR) is easy to use in a general
purpose multifrontal solver and its potential compared
to standard (full-rank) solvers has been demonstrated.
Recently, new variants have been introduced and it was
proved that they can further reduce the complexity but
their performance has never been analyzed. In this
article, we present a multithreaded BLR factorization
and analyze its efficiency and scalability in
shared-memory multicore environments. We identify the
challenges posed by the use of BLR approximations in
multifrontal solvers and put forward several
algorithmic variants of the BLR factorization that
overcome these challenges by improving its efficiency
and scalability. We illustrate the performance analysis
of the BLR multifrontal factorization with numerical
experiments on a large set of problems coming from a
variety of real-life applications.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
author = "Miguel Areias and Ricardo Rocha",
title = "Multi-dimensional lock-free arrays for multithreaded
mode-directed tabling in {Prolog}",
journal = j-CCPE,
volume = "31",
number = "5",
pages = "e4491:1--e4491:??",
day = "10",
month = mar,
year = "2019",
DOI = "https://doi.org/10.1002/cpe.4491",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Thu Mar 28 08:07:55 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
acknowledgement = ack-nhfb,
fjournal = "Concurrency and Computation: Practice and Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "30 March 2018",
author = "Esmail Asyabi and Erfan Sharafzadeh and SeyedAlireza
SanaeeKohroudi and Mohsen Sharifi",
title = "{CTS}: an operating system {CPU} scheduler to mitigate
tail latency for latency-sensitive multi-threaded
journal = j-J-PAR-DIST-COMP,
volume = "133",
number = "??",
pages = "232--243",
month = nov,
year = "2019",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Fri Sep 13 10:25:21 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0743731518302387",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Levente Bajczi and Andr{\'a}s V{\"o}r{\"o}s and Vince
title = "Will My Program Break on This Faulty Processor?:
{Formal} Analysis of Hardware Fault Activations in
Concurrent Embedded Software",
journal = j-TECS,
volume = "18",
number = "5s",
pages = "89:1--89:??",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3358238",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Oct 17 18:16:44 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/ft_gateway.cfm?id=3358238",
abstract = "Formal verification is approaching a point where it
will be reliably applicable to embedded software. Even
though formal verification can efficiently analyze
multi-threaded applications, multi-core processors are
often considered too dangerous to use in critical
systems, despite the many benefits they can offer. One
reason is the advanced memory consistency model of such
CPUs. Nowadays, most software verifiers assume strict
sequential consistency, which is also the na{\"\i}ve
view of programmers. Modern multi-core processors,
however, rarely guarantee this assumption by default.
In addition, complex processor architectures may easily
contain design faults. Thanks to the recent advances in
hardware verification, these faults are increasingly
visible and can be detected even in existing
processors, giving an opportunity to compensate for the
problem in software. In this paper, we propose a
generic approach to consider inconsistent behavior of
the hardware in the analysis of software. Our approach
is based on formal methods and can be used to detect
the activation of existing hardware faults on the
application level and facilitate their mitigation in
software. The approach relies heavily on recent results
of model checking and hardware verification and offers
new, integrative research directions. We propose a
partial solution based on existing model checking tools
to demonstrate feasibility and evaluate their
performance in this context.",
acknowledgement = ack-nhfb,
articleno = "89",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840",
author = "Jonathan Balkind and Michael McKeown and Yaosheng Fu
and Tri Nguyen and Yanqi Zhou and Alexey Lavrov and
Mohammad Shahrad and Adi Fuchs and Samuel Payne and
Xiaohua Liang and Matthew Matl and David Wentzlaff",
title = "{OpenPiton}: an open source hardware platform for your
journal = j-CACM,
volume = "62",
number = "12",
pages = "79--87",
month = dec,
year = "2019",
DOI = "https://doi.org/10.1145/3366343",
ISSN = "0001-0782 (print), 1557-7317 (electronic)",
ISSN-L = "0001-0782",
bibdate = "Mon Nov 25 09:55:53 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cacm2010.bib;
URL = "https://cacm.acm.org/magazines/2019/12/241058/fulltext",
abstract = "Industry is building larger, more complex, manycore
processors on the back of strong institutional
knowledge, but academic projects face difficulties in
replicating that scale. To alleviate these difficulties
and to develop and share knowledge, the community needs
open architecture frameworks for simulation, chip
design, and software exploration that support
extensibility, scalability, and configurability,
alongside an established base of verification tools and
supported software. In this article, we present
OpenPiton, an open source framework for building
scalable architecture research prototypes from one core
to 500 million cores. OpenPiton is the world's first
open source, general-purpose, multithreaded manycore
processor, and framework. OpenPiton is highly
configurable, providing a rich design space spanning a
variety of hardware parameters that researchers can
change. OpenPiton designs can be emulated on FPGAs,
where they can run full-stack multiuser Debian Linux.
OpenPiton is designed to scale to very large core
fabrics, enabling researchers to measure operating
system, compiler, and software scalability. The mature
code-base reflects the complexity of an
industrial-grade design and provides the necessary
scripts to build new chips, making OpenPiton a natural
choice for computer-aided design (CAD) research.
OpenPiton has been validated with a 25-core chip
prototype, named Piton, and is bolstered by a
validation suite that has thousands of tests, providing
an environment to test new hardware designs while
verifying the correctness of the whole system.
OpenPiton is being actively used in research both
internally to Princeton and in the wider community, as
well as being adopted in education, industry, and
government settings.",
acknowledgement = ack-nhfb,
fjournal = "Communications of the ACM",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J79",
author = "Paola Bonizzoni and Gianluca Della Vedova and Yuri
Pirola and Marco Previtali and Raffaella Rizzi",
title = "Multithread Multistring {Burrows--Wheeler} Transform
and Longest Common Prefix Array",
journal = j-J-COMPUT-BIOL,
volume = "26",
number = "9",
pages = "948--961",
month = sep,
year = "2019",
DOI = "https://doi.org/10.1089/cmb.2018.0230",
ISSN = "1066-5277 (print), 1557-8666 (electronic)",
ISSN-L = "1066-5277",
bibdate = "Tue Oct 8 06:02:58 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jcomputbiol.bib;
URL = "https://www.liebertpub.com/doi/abs/10.1089/cmb.2018.0230;
acknowledgement = ack-nhfb,
fjournal = "Journal of Computational Biology",
journal-URL = "https://www.liebertpub.com/loi/cmb/",
onlinedate = "29 May 2019",
author = "M. S. M. Bouksiaa and F. Trahay and A. Lescouet and G.
Voron and R. Dulong and A. Guermouche and {\'E}. Brunet
and G. Thomas",
title = "Using Differential Execution Analysis to Identify
Thread Interference",
volume = "30",
number = "12",
pages = "2866--2878",
month = dec,
year = "2019",
DOI = "https://doi.org/10.1109/TPDS.2019.2927481",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu Dec 19 09:20:35 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
Systems",
journal-URL = "http://www.computer.org/portal/web/csdl/transactions/tpds",
keywords = "bottleneck detection; Energy storage; Generators;
multithreading; Performance analysis; Power system
stability; Real-time systems; Renewable energy sources;
Supply and demand",
author = "Hadi Brais and Preeti Ranjan Panda",
title = "{Alleria}: an Advanced Memory Access Profiling
Framework",
journal = j-TECS,
volume = "18",
number = "5s",
pages = "81:1--81:??",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3358193",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Oct 17 18:16:44 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/ft_gateway.cfm?id=3358193",
abstract = "Application analysis and simulation tools are used
extensively by embedded system designers to improve
existing optimization techniques or develop new ones.
We propose the Alleria framework to make it easier for
designers to comprehensively collect critical
information such as virtual and physical memory
addresses, accessed values, and thread schedules about
one or more target applications. Such profilers often
incur substantial performance overheads that are orders
of magnitude larger than native execution time. We
discuss how that overhead can be significantly reduced
using a novel profiling mechanism called adaptive
profiling. We develop a heuristic-based adaptive
profiling mechanism and evaluate its performance using
single-threaded and multi-threaded applications. The
proposed technique can improve profiling throughput by
up to 145\% and by 37\% on an average, enabling Alleria
to be used to comprehensively profile applications with
a throughput of over 3 million instructions per
second.",
acknowledgement = ack-nhfb,
articleno = "81",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840",
author = "Prerna Budhkar and Ildar Absalyamov and Vasileios Zois
and Skyler Windh and Walid A. Najjar and Vassilis J.
title = "Accelerating In-Memory Database Selections Using
Latency Masking Hardware Threads",
journal = j-TACO,
volume = "16",
number = "2",
pages = "13:1--13:??",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3310229",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jul 26 14:25:54 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Inexpensive DRAMs have created new opportunities for
in-memory data analytics. However, the major bottleneck
in such systems is high memory access latency.
Traditionally, this problem is solved with large cache
hierarchies that only benefit regular applications.
Alternatively, many data-intensive applications exhibit
irregular behavior. Hardware multithreading can better
cope with high latency seen in such applications. This
article implements a multithreaded prototype (MTP) on
FPGAs for the relational selection operator that
exhibits control flow irregularity. On a standard TPC-H
query evaluation, MTP achieves a bandwidth utilization
of 83\%, while the CPU and the GPU implementations
achieve 61\% and 64\%, respectively. Besides being
bandwidth efficient, MTP is also $ 14.2 \times $ and $
4.2 \times $ more power efficient than CPU and GPU,
respectively.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Shane Carroll and Wei-ming Lin",
title = "Applied On-Chip Machine Learning for Dynamic Resource
Control in Multithreaded Processors",
volume = "29",
number = "03",
pages = "??--??",
month = sep,
year = "2019",
DOI = "https://doi.org/10.1142/S0129626419500130",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
ISSN-L = "0129-6264",
bibdate = "Mon Mar 29 12:30:09 MDT 2021",
bibsource = "http://ejournals.wspc.com.sg/ppl/;
URL = "https://www.worldscientific.com/doi/10.1142/S0129626419500130",
abstract = "In this paper, we propose a machine learning algorithm
to control instruction fetch bandwidth in a
simultaneous multithreaded CPU. In a simultaneous
multithreaded CPU, multiple threads occupy pools of
hardware resources in the same clock cycle. Under some
conditions, one or more threads may undergo a period of
inefficiency, e.g., a cache miss, thereby inefficiently
using shared resources and degrading the performance of
other threads. If these inefficiencies can be
identified at runtime, the offending thread can be
temporarily blocked from fetching new instructions into
the pipeline and given time to recover from its
inefficiency, and prevent the shared system resources
from being wasted on a stalled thread. In this paper,
we propose a machine learning approach to determine
when a thread should be blocked from fetching new
instructions. The model is trained offline and the
parameters embedded in a CPU, which can be queried with
runtime statistics to determine if a thread is running
inefficiently and should be temporarily blocked from
fetching. We propose two models: a simple linear model
and a higher-capacity neural network. We test each
model in a simulation environment and show that system
performance can increase by up to 19\% on average with a
feasible implementation of the proposed algorithm.",
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "Shane Carroll and Wei-ming Lin",
title = "Round Robin Thread Selection Optimization in
Multithreaded Processors",
volume = "29",
number = "01",
pages = "??--??",
month = mar,
year = "2019",
DOI = "https://doi.org/10.1142/S0129626419500038",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
ISSN-L = "0129-6264",
bibdate = "Mon Mar 29 12:30:06 MDT 2021",
bibsource = "http://ejournals.wspc.com.sg/ppl/;
URL = "https://www.worldscientific.com/doi/10.1142/S0129626419500038",
abstract = "We propose a variation of round-robin ordering in a
multi-threaded pipeline to increase system throughput
and resource distribution fairness. We show that using
round robin with a typical arbitrary ordering results
in inefficient use of shared resources and subsequent
thread starvation. To address this but still use a
simple round-robin approach, we optimally and
dynamically sort the order of the round robin
periodically at runtime. We show that with 4-threaded
workloads, throughput can be improved by over 9\% and
harmonic throughput by over 3\% by sorting thread order
at run time. We experiment with multiple stages of the
pipeline and show consistent results throughout several
experiments using the SPEC CPU 2006 benchmarks.
Furthermore, since the technique is still a simple
round robin, the increased performance requires little
overhead to implement.",
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "B. B. Fraguela and D. Andrade",
title = "Easy Dataflow Programming in Clusters with {UPC++}
{DepSpawn}",
volume = "30",
number = "6",
pages = "1267--1282",
month = jun,
year = "2019",
DOI = "https://doi.org/10.1109/TPDS.2018.2884716",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Fri Aug 30 06:09:58 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
Systems",
journal-URL = "http://www.computer.org/portal/web/csdl/transactions/tpds",
keywords = "application program interfaces; arbitrarily complex
task-parallel codes; Arrays; C++ languages; data flow
analysis; dataflow; dataflow approach; distributed
memory; distributed memory systems; easy dataflow
programming; Electronics packaging; host language;
implied uncertainties; interoperability; Libraries;
message passing; multi-threading; multithreading;
parallel processing; parallel programming; parallel
programming models; partitioned global address space
programming model; PGAS libraries; PGAS UPC++ library;
programmability; Programming; Proposals; relevant
proposals; software libraries; Task analysis;
traditional message-passing paradigm; UPC++ DepSpawn",
author = "C. Gueunet and P. Fortin and J. Jomier and J. Tierny",
title = "Task-Based Augmented Contour Trees with {Fibonacci}
Heaps",
volume = "30",
number = "8",
pages = "1889--1905",
month = aug,
year = "2019",
DOI = "https://doi.org/10.1109/TPDS.2019.2898436",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Fri Aug 30 06:09:58 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fibquart.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
Systems",
journal-URL = "http://www.computer.org/portal/web/csdl/transactions/tpds",
keywords = "computation procedure; contour tree based
applications; Data analysis; data segmentation
applications; data structures; Data structures; data
visualisation; Data visualization; fast shared memory;
Fibonacci heaps; independent local tasks; intermediate
data structures; join split trees; multi-core
architecture; multi-threading; multicore computation;
OpenMP task runtime; parallel algorithm; parallel
algorithms; Parallel algorithms; parallel thanks;
Runtime; Scientific visualization; Task analysis; task
parallelism; task-based augmented contour trees;
topological data analysis; tree algorithm; trees
author = "Vladimir Herdt and Hoang M. Le and Daniel Gro{\ss}e
and Rolf Drechsler",
title = "Combining sequentialization-based verification of
multi-threaded {C} programs with symbolic {Partial
Order Reduction}",
volume = "21",
number = "5",
pages = "545--565",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1007/s10009-019-00507-5",
ISSN = "1433-2779 (print), 1433-2787 (electronic)",
ISSN-L = "1433-2779",
bibdate = "Fri Oct 11 15:05:00 MDT 2019",
bibsource = "http://link.springer.com/journal/10009/21/5;
URL = "https://link.springer.com/article/10.1007/s10009-019-00507-5",
acknowledgement = ack-nhfb,
fjournal = "International Journal on Software Tools for Technology
Transfer (STTT)",
journal-URL = "http://link.springer.com/journal/10009",
author = "Konstantinos Iliakis and Sotirios Xydis and Dimitrios
title = "{LOOG}: Improving {GPU} Efficiency With Light-Weight
Out-Of-Order Execution",
volume = "18",
number = "2",
pages = "166--169",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2951161",
ISSN = "1556-6064",
ISSN-L = "1556-6056",
bibdate = "Thu May 27 16:19:32 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "GPUs are one of the most prevalent platforms for
accelerating general-purpose workloads due to their
intuitive programming model, computing capacity, and
cost-effectiveness. GPUs rely on massive
multi-threading and fast context switching to overlap
computations with memory operations. Among the diverse
GPU workloads, there exists a class of kernels that
fail to maintain a sufficient number of active warps to
hide the latency of memory operations, and thus suffer
from frequent stalling. We observe that these kernels
will benefit from increased levels of Instruction-Level
Parallelism and we propose a novel architecture with
lightweight Out-Of-Order execution capability. To
minimize hardware overheads, we carefully design our
extension to highly re-use the existing
micro-architectural structures. We show that the
proposed architecture outperforms traditional platforms
by 15 to 46 percent on average for low occupancy
kernels, with an area overhead of 0.74 to 3.94 percent.
Finally, we prove the potential of our proposal as a
GPU u-arch alternative, by providing a 5 percent
speedup over a wide collection of 63 general-purpose
kernels with as little as 0.74 percent area overhead.",
acknowledgement = ack-nhfb,
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Copper; GPGPU; Graphics processing units; Kernel;
micro-architecture; Out of order; Out-of-Order
execution; Radio access technologies; Radio frequency;
author = "Z. Jia and W. Gao and Y. Shi and S. A. McKee and Z. Ji
and J. Zhan and L. Wang and L. Zhang",
title = "Understanding Processors Design Decisions for Data
Analytics in Homogeneous Data Centers",
journal = j-IEEE-TRANS-BIG-DATA,
volume = "5",
number = "1",
pages = "81--94",
month = mar,
year = "2019",
DOI = "https://doi.org/10.1109/TBDATA.2017.2758792",
ISSN = "2332-7790",
bibdate = "Fri Aug 2 11:24:47 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetransbigdata.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Big Data",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=6687317",
keywords = "Big Data; big data; brawny multicore processors;
Clocks; computational performance; computer centres;
Data analysis; data analysis; Data analytics; data
analytics workloads; data center systems; energy
conservation; energy efficiency; energy-efficiency;
homogeneous data centers; many-core processors;
multi-threading; Multicore processing; multiprocessing
systems; performance; performance-cost efficiency;
Pipelines; power aware computing; processor design
decisions; processor evaluation; Program processors;
simultaneous multithreading",
author = "Steve Klabnik and Carol Nichols",
title = "The {Rust} programming language",
publisher = pub-NO-STARCH,
address = pub-NO-STARCH:adr,
edition = "Second",
pages = "xxix + 526",
year = "2019",
ISBN = "1-09-812253-4, 1-71850-044-0 (paperback)",
ISBN-13 = "978-1-09-812253-9, 978-1-71850-044-0 (paperback)",
LCCN = "QA76.73.R87",
bibdate = "Fri Nov 8 05:59:02 MST 2019",
bibsource = "fsz3950.oclc.org:210/WorldCat;
URL = "http://proquest.safaribooksonline.com/?fpi=9781098122539;
abstract = "\booktitle{The Rust Programming Language} is the
official book on Rust: an open source systems
programming language that helps you write faster, more
reliable software. Rust offers control over low-level
details (such as memory usage) in combination with
high-level ergonomics, eliminating the hassle
traditionally associated with low-level languages. The
authors of \booktitle{The Rust Programming Language},
members of the Rust Core Team, share their knowledge
and experience to show you how to take full advantage
of Rust's features --- from installation to creating robust
and scalable programs. You'll begin with basics like
creating functions, choosing data types, and binding
variables and then move on to more advanced concepts,
such as: * Ownership and borrowing, lifetimes, and
traits; * Using Rust's memory safety guarantees to build
fast, safe programs; * Testing, error handling, and
effective refactoring; * Generics, smart pointers,
multithreading, trait objects, and advanced pattern
matching; * Using Cargo, Rust's built-in package
manager, to build, test, and document your code and
manage dependencies; * How best to use Rust's advanced
compiler with compiler-led programming techniques.
You'll find plenty of code examples throughout the
book, as well as three chapters dedicated to building
complete projects to test your learning: a number
guessing game, a Rust implementation of a command line
tool, and a multithreaded server. New to this edition:
An extended section on Rust macros, an expanded chapter
on modules, and appendixes on Rust development tools
and editions.",
acknowledgement = ack-nhfb,
libnote = "Not in my library.",
subject = "Rust (Computer program language); Computer
programming; Computer programming.; Rust (Computer
program language)",
tableofcontents = "1: Getting started \\
2: Programming a guessing game \\
3: Common programming concepts \\
4: Understanding ownership \\
5: Using structs to structure related data \\
6: Enums and pattern matching \\
7: Managing growing projects with packages, crates, and
modules \\
8: Common collections \\
9: Error handling \\
10: Generic types, traits, and lifetimes \\
11: Writing automated tests \\
12: An I/O project: building a command line program \\
13: Functional language features: iterators and
closures \\
14: More about Cargo and Crates.io \\
15: Smart pointers \\
16: Fearless concurrency \\
17: Object-oriented programming features of Rust \\
18: Patterns and matching \\
19: Advanced features \\
20: Final project: building a multithreaded web server \\
Appendix A: Keywords \\
Appendix B: Operators and Symbols \\
Appendix C: Derivable Traits \\
Appendix D: Useful Development Tools \\
Appendix E: Editions \\
author = "Ignacio Laguna and Paul C. Wood and Ranvijay Singh and
Saurabh Bagchi",
title = "{GPUMixer}: Performance-Driven Floating-Point Tuning
for {GPU} Scientific Applications",
type = "Report",
institution = "Lawrence Livermore National Laboratory",
address = "Livermore, CA 94550, USA",
year = "2019",
bibdate = "Tue Aug 06 05:54:23 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
URL = "http://lagunaresearch.org/docs/isc-2019.pdf;
abstract = "We present GPUMixer, a tool to perform mixed-precision
floating-point tuning on scientific GPU applications.
While precision tuning techniques are available, they
are designed for serial programs and are
accuracy-driven, i.e., they consider configurations
that satisfy accuracy constraints, but these
configurations may degrade performance. GPUMixer, in
contrast, presents a performance-driven approach for
tuning. We introduce a novel static analysis that finds
Fast Imprecise Sets (FISets), sets of operations on low
precision that minimize type conversions, which often
yield performance speedups. To estimate the relative
error introduced by GPU mixed-precision, we propose
shadow computations analysis for GPUs, the first of
this class for multi-threaded applications. GPUMixer
obtains performance improvements of up to 46.4\% of the
ideal speedup in comparison to only 20.7\% found by
state-of-the-art methods.",
acknowledgement = ack-nhfb,
remark = "Best paper award at the 33rd ISC High Performance
conference held June 16--20, 2019.",
author = "Yuxiang Li and Yinliang Zhao and Liyu Sun and Mengjuan
title = "A hybrid sample generation approach in speculative
volume = "75",
number = "8",
pages = "4193--4225",
month = aug,
year = "2019",
DOI = "https://doi.org/10.1007/s11227-017-2118-3",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Thu Oct 10 15:31:22 MDT 2019",
bibsource = "http://link.springer.com/journal/11227/75/8;
acknowledgement = ack-nhfb,
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Y. Li and K. Nomura and J. A. Insley and V. Morozov
and K. Kumaran and N. A. Romero and W. A. Goddard and
R. K. Kalia and A. Nakano and P. Vashishta",
title = "Scalable Reactive Molecular Dynamics Simulations for
Computational Synthesis",
journal = j-COMPUT-SCI-ENG,
volume = "21",
number = "5",
pages = "64--75",
month = sep,
year = "2019",
DOI = "https://doi.org/10.1109/MCSE.2018.110150043",
ISSN = "1521-9615 (print), 1558-366X (electronic)",
ISSN-L = "1521-9615",
bibdate = "Mon Aug 19 06:40:58 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/computscieng.bib;
acknowledgement = ack-nhfb,
fjournal = "Computing in Science and Engineering",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
keywords = "Computational modeling; Computer science; computer
system implementation; mathematics of computing;
computing methodologies; data; general; large and
medium (mainframe) computers; Materials science and
technology; Mathematical model; modeling and
prediction; Multithreading; numerical analysis;
Numerical models; operating systems; parallel
algorithms; performance; Predictive models; simulation
theory; simulation, modeling, and visualization;
software; software engineering; super (very large)
computers; system applications and experience; theory
of computation; types of simulation",
author = "Bing Li and Mengjie Mao and Xiaoxiao Liu and Tao Liu
and Zihao Liu and Wujie Wen and Yiran Chen and Hai
(Helen) Li",
title = "Thread Batching for High-performance Energy-efficient
{GPU} Memory Design",
journal = j-JETC,
volume = "15",
number = "4",
pages = "39:1--39:??",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3330152",
ISSN = "1550-4832",
bibdate = "Tue Dec 17 07:50:24 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib;
URL = "https://dl.acm.org/ft_gateway.cfm?id=3330152",
abstract = "Massive multi-threading in GPU imposes tremendous
pressure on memory subsystems. Due to rapid growth in
thread-level parallelism of GPU and slowly improved
peak memory bandwidth, memory becomes a bottleneck of
GPU's performance and energy efficiency. In this
article, we propose an integrated architectural scheme
to optimize the memory accesses and therefore boost the
performance and energy efficiency of GPU. First, we
propose a thread batch enabled memory partitioning
(TEMP) to improve GPU memory access parallelism. In
particular, TEMP groups multiple thread blocks that
share the same set of pages into a thread batch and
applies a page coloring mechanism to bound each stream
multiprocessor (SM) to the dedicated memory banks.
After that, TEMP dispatches the thread batch to an SM
to ensure high-parallel memory-access streaming from
the different thread blocks. Second, a thread
batch-aware scheduling (TBAS) scheme is introduced to
improve the GPU memory access locality and to reduce
the contention on memory controllers and
interconnection networks. Experimental results show
that the integration of TEMP and TBAS can achieve up to
10.3\% performance improvement and 11.3\% DRAM energy
reduction across diverse GPU applications. We also
evaluate the performance interference of the mixed
CPU+GPU workloads when they are run on a heterogeneous
system that employs our proposed schemes. Our results
show that a simple solution can effectively ensure the
efficient execution of both GPU and CPU applications.",
acknowledgement = ack-nhfb,
articleno = "39",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
author = "Vladimir Mironov and Yuri Alexeev and Dmitri G.
title = "Multithreaded parallelization of the energy and
analytic gradient in the fragment molecular orbital
journal = j-IJQC,
volume = "119",
number = "12",
pages = "e25937:1--e25937:??",
day = "15",
month = jun,
year = "2019",
DOI = "https://doi.org/10.1002/qua.25937",
ISSN = "0020-7608 (print), 1097-461X (electronic)",
ISSN-L = "0020-7608",
bibdate = "Wed Oct 9 06:14:07 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ijqc2010.bib;
acknowledgement = ack-nhfb,
fjournal = "International Journal of Quantum Chemistry",
journal-URL = "http://www.interscience.wiley.com/jpages/0020-7608/",
onlinedate = "26 April 2019",
author = "Isil Oz and Sanem Arslan",
title = "A Survey on Multithreading Alternatives for Soft Error
Fault Tolerance",
journal = j-COMP-SURV,
volume = "52",
number = "2",
pages = "27:1--27:??",
month = may,
year = "2019",
DOI = "https://doi.org/10.1145/3302255",
ISSN = "0360-0300 (print), 1557-7341 (electronic)",
ISSN-L = "0360-0300",
bibdate = "Sat Aug 31 09:04:37 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/compsurv.bib;
URL = "https://dl.acm.org/ft_gateway.cfm?id=3302255",
abstract = "Smaller transistor sizes and reduction in voltage
levels in modern microprocessors induce higher soft
error rates. This trend makes reliability a primary
design constraint for computer systems. Redundant
multithreading (RMT) makes use of parallelism in modern
systems by employing thread-level time redundancy for
fault detection and recovery. RMT can detect faults by
running identical copies of the program as separate
threads in parallel execution units with identical
inputs and comparing their outputs. In this article, we
present a survey of RMT implementations at different
architectural levels with several design
considerations. We explain the implementations in
seminal papers and their extensions and discuss the
design choices employed by the techniques. We review
both hardware and software approaches by presenting the
main characteristics and analyze the studies with
different design choices regarding their strengths and
weaknesses. We also present a classification to help
potential users find a suitable method for their
requirement and to guide researchers planning to work
on this area by providing insights into the future
acknowledgement = ack-nhfb,
articleno = "27",
fjournal = "ACM Computing Surveys",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J204",
author = "{\'A}goston R{\'o}th",
title = "Algorithm 992: An {OpenGL}- and {C++}-based Function
Library for Curve and Surface Modeling in a Large Class
of Extended {Chebyshev} Spaces",
journal = j-TOMS,
volume = "45",
number = "1",
pages = "13:1--13:32",
month = mar,
year = "2019",
DOI = "https://doi.org/10.1145/3284979",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Mon May 6 18:23:42 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/citation.cfm?id=3284979",
abstract = "We propose a platform-independent multi-threaded
function library that provides data structures to
generate, differentiate, and render both the ordinary
basis and the normalized B-basis of a user-specified
extended Chebyshev (EC) space that comprises the
constants and can be identified with the solution space
of a constant-coefficient homogeneous linear
differential equation defined on a sufficiently small
interval. Using the obtained normalized B-bases, our
library can also generate, (partially) differentiate,
modify, and visualize a large family of so-called
B-curves and tensor product B-surfaces. Moreover, the
library also implements methods that can be used to
perform dimension elevation, to subdivide B-curves and
B-surfaces by means of de Casteljau-like B-algorithms,
and to generate basis transformations for the
B-representation of arbitrary integral curves and
surfaces that are described in traditional parametric
form by means of the ordinary bases of the underlying
EC spaces. Independently of the algebraic, exponential,
trigonometric, or mixed type of the applied EC space,
the proposed library is numerically stable and
efficient up to a reasonable dimension number and may
be useful for academics and engineers in the fields of
Approximation Theory, Computer Aided Geometric Design,
Computer Graphics, and Isogeometric and Numerical
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "http://dl.acm.org/pub.cfm?id=J782",
author = "J. M. Sabarimuthu and T. G. Venkatesh",
title = "Analytical Derivation of Concurrent Reuse Distance
Profile for Multi-Threaded Application Running on Chip
volume = "30",
number = "8",
pages = "1704--1721",
month = aug,
year = "2019",
DOI = "https://doi.org/10.1109/TPDS.2019.2896633",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Fri Aug 30 06:09:58 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
journal-URL = "http://www.computer.org/portal/web/csdl/transactions/tpds",
keywords = "analytical model; analytical model based reuse
distance prediction; Analytical models; cache memory
design space; cache performance; cache storage;
coherent reuse distance profile; compiler optimization;
Complexity theory; concurrent reuse distance;
concurrent reuse distance profile; Histograms;
Instruction sets; locality analysis; Markov chain;
Markov processes; Measurement; microprocessor chips;
multi-core processors; multi-threaded applications;
multi-threading; multicore simulator Sniper;
multiprocessing systems; multithreaded application;
optimisation; Performance analysis; performance
analysis; probability; probability theory; Reuse
distance profile; shared memory environment;
simulation; standalone reuse distance profile; thread
author = "Avik Sengupta",
title = "{Julia} high performance: optimizations, distributed
computing, multithreading, and {GPU} programming with
{Julia 1.0} and beyond",
publisher = pub-PACKT,
address = pub-PACKT:adr,
edition = "Second",
pages = "218",
year = "2019",
ISBN = "1-78829-230-8, 1-78829-811-X",
ISBN-13 = "978-1-78829-230-6, 978-1-78829-811-7",
LCCN = "????",
bibdate = "Thu Apr 8 16:49:31 MDT 2021",
bibsource = "fsz3950.oclc.org:210/WorldCat;
URL = "http://portal.igpublish.com/iglibrary/search/PACKT0005341.html",
abstract = "Julia is a high-level, high-performance dynamic
programming language for numerical computing. This book
will help you understand the performance
characteristics of your Julia programs and achieve
near-C levels of performance in Julia.",
acknowledgement = ack-nhfb,
subject = "Julia (Computer program language); Application
software; Development; Development.; Julia (Computer
program language)",
tableofcontents = "Foreword \\
Contributors \\
Table of Contents \\
Preface \\
1: Julia is Fast \\
Julia --- fast and dynamic \\
Designed for speed \\
JIT and LLVM \\
Types, type inference, and code specialization \\
How fast can Julia be? \\
Summary \\
2: Analyzing Performance \\
Timing Julia functions \\
The @time macro \\
Other time macros \\
The Julia profiler \\
Using the profiler \\
ProfileView \\
Using Juno for profiling \\
Using TimerOutputs \\
Analyzing memory allocation \\
Using the memory allocation tracker \\
Statistically accurate benchmarking \\
Using \pkg{BenchmarkTools.jl} \\
Summary \\
3: Types, Type Inference, and Stability \\
The Julia type system \\
Using types \\
Multiple dispatch \\
Abstract types \\
Julia's type hierarchy \\
Composite and immutable types \\
Type parameters \\
Type inference \\
Type-stability \\
Definitions \\
Fixing type instability \\
The performance pitfalls \\
Identifying type stability \\
Loop variables \\
Kernel methods and function barriers \\
Types in storage locations \\
Arrays \\
Composite types \\
Parametric composite types \\
Summary \\
4: Making Fast Function Calls \\
Using globals \\
The trouble with globals \\
Fixing performance issues with globals \\
Inlining \\
Default inlining \\
Controlling inlining \\
Disabling inlining \\
Constant propagation \\
Using macros for performance \\
The Julia compilation process \\
Using macros \\
Evaluating a polynomial \\
Horner's method \\
The Horner macro \\
Generated functions \\
Using generated functions \\
Using generated functions for performance \\
Using keyword arguments \\
Summary \\
5: Fast Numbers \\
Numbers in Julia, their layout, and storage \\
Integers \\
Integer overflow \\
BigInt \\
The floating point \\
Floating point accuracy \\
Unsigned integers \\
Trading performance for accuracy \\
The @fastmath macro \\
The K-B-N summation \\
Subnormal numbers \\
Subnormal numbers to zero \\
Summary \\
6: Using Arrays \\
Array internals in Julia \\
Array representation and storage \\
Column-wise storage \\
Adjoints \\
Array initialization \\
Bounds checking \\
Removing the cost of bounds checking \\
Configuring bound checks at startup \\
Allocations and in-place operations \\
Preallocating function output \\
sizehint! \\
Mutating functions \\
Broadcasting \\
Array views \\
SIMD parallelization (AVX2, AVX512) \\
SIMD.jl \\
Specialized array types \\
Static arrays \\
Structs of arrays \\
Yeppp! \\
Writing generic library functions with arrays \\
Summary \\
7: Accelerating Code with the GPU \\
Technical requirements \\
Getting started with GPUs \\
CUDA and Julia \\
CuArrays \\
Monte Carlo simulation on the GPU \\
Writing your own kernels \\
Measuring GPU performance \\
Performance tips \\
Scalar iteration \\
Combining kernels \\
Processing more data \\
Deep learning on the GPU \\
ArrayFire \\
Summary \\
8: Concurrent Programming with Tasks \\
Tasks \\
Using tasks \\
The task life cycle \\
task\_local\_storage \\
Communicating between tasks \\
Task iteration \\
High-performance I/O",
author = "Colin Shea and Tinoosh Mohsenin",
title = "Heterogeneous Scheduling of Deep Neural Networks for
Low-power Real-time Designs",
journal = j-JETC,
volume = "15",
number = "4",
pages = "36:1--36:??",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3358699",
ISSN = "1550-4832",
bibdate = "Tue Dec 17 07:50:24 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib;
URL = "https://dl.acm.org/ft_gateway.cfm?id=3358699",
abstract = "Deep neural networks have become the readiest answer
to a range of application challenges including image
recognition, stock analysis, natural language
processing, and biomedical applications such as seizure
detection. All while outperforming prior leading
solutions that relied heavily on hand-engineered
techniques. However, deployment of these neural
networks often requires high-computational and
memory-intensive solutions. These requirements make it
challenging to deploy Deep Neural Networks (DNNs) in
embedded, real-time low-power applications where
classic architectures, GPUs and CPUs, still impose
significant power burden. Systems-on-Chip (SoC) with
Field-programmable Gate Arrays (FPGAs) can be used to
improve performance and allow more fine-grain control
of resources than CPUs or GPUs, but it is difficult to
find the optimal balance between hardware and software
to improve DNN efficiency. In the current research
literature there have been few proposed solutions to
address optimizing hardware and software deployments of
DNNs in embedded low-power systems. To address the
computation resource restriction and low-power needs
for deploying these networks, we describe and implement
a domain-specific metric model for optimizing task
deployment on differing platforms, hardware and
software. Next, we propose a DNN hardware accelerator
called Scalable Low-power Accelerator for real-time
deep neural Networks (SCALENet) that includes
multithreaded software workers. Finally, we propose a
heterogeneous aware scheduler that uses the
DNN-specific metric models and the SCALENet accelerator
to allocate a task to a resource based on solving a
numerical cost for a series of domain objectives. To
demonstrate the applicability of our contribution, we
deploy nine modern deep network architectures, each
containing a different number of parameters within the
context of two different neural network applications:
image processing and biomedical seizure detection.
Utilizing the metric modeling techniques integrated
into the heterogeneous aware scheduler and the SCALENet
accelerator, we demonstrate the ability to meet
computational requirements, adapt to multiple
architectures, and lower power by providing an
optimized task to resource allocation. Our
heterogeneous aware scheduler improves power saving by
decreasing power consumption by 10\% of the total
system power, does not affect the accuracy of the
networks, and still meets the real-time deadlines. We
demonstrate the ability to achieve parity with or
exceed the energy efficiency of NVIDIA GPUs when
evaluated against Jetson TK1 with embedded GPU SoC and
with a 4$ \times $ power savings in a power envelope of
2.0W. When compared to existing FPGA-based
accelerators, SCALENet's accelerator and heterogeneous
aware scheduler achieves a 4$ \times $ improvement in
energy efficiency.",
acknowledgement = ack-nhfb,
articleno = "36",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
author = "G. Shomron and T. Horowitz and U. Weiser",
title = "{SMT-SA}: Simultaneous Multithreading in Systolic
volume = "18",
number = "2",
pages = "99--102",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2924007",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Oct 1 10:18:16 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Systolic arrays (SAs) are highly parallel pipelined
structures capable of executing various tasks such as
matrix multiplication and convolution. They comprise a
grid of usually homogeneous processing units (PUs) that
are responsible for the multiply-accumulate (MAC)
operations in the case of matrix multiplication. It is
not rare for a PU input to be zero-valued, in which
case the PU becomes idle and the array becomes
underutilized. In this paper we consider a solution to
employ the underutilized PUs via simultaneous
multithreading (SMT). We explore the design space of a
SMT-SA variant and evaluate its performance, area
efficiency, and energy consumption. In addition, we
suggest a tiling method to reduce area overheads. Our
evaluation shows that a 4-thread FP16-based SMT-SA
achieves speedups of up to $ 3.6 \times $ as compared
to conventional SA, with $ 1.7 \times $ area overhead
and negligible energy overhead.",
acknowledgement = ack-nhfb,
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "4-thread FP16-based SMT-SA; area efficiency;
Convolution; Correlation; Deep learning; Energy
consumption; energy consumption; homogeneous processing
units; Instruction sets; matrix multiplication;
multi-threading; multiply-accumulate operations;
Multithreading; multithreading; parallel pipelined
structures; PU input; simultaneous multithreading;
SMT-SA variant; Systolic arrays; systolic arrays; Task
author = "Lucas Bragan{\c{c}}a {Da Silva} and Ricardo Ferreira
and Michael Canesche and Marcelo M. Menezes and Maria
D. Vieira and Jeronimo Penha and Peter Jamieson and
Jos{\'e} Augusto M. Nacif",
title = "{READY}: a Fine-Grained Multithreading Overlay
Framework for Modern {CPU--FPGA} Dataflow
journal = j-TECS,
volume = "18",
number = "5s",
pages = "56:1--56:??",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3358187",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Oct 17 18:16:44 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/ft_gateway.cfm?id=3358187",
abstract = "In this work, we propose a framework called
REconfigurable Accelerator DeploY (READY), the first
framework to support polynomial runtime mapping of
dataflow applications in high-performance CPU-FPGA
platforms. READY introduces an efficient mapping with
fine-grained multithreading onto an overlay
architecture that hides the latency of a global
interconnection network. In addition to our overlay
architecture, we show how this system helps solve some
of the challenges for FPGA cloud computing adoption in
high-performance computing. The framework encapsulates
dataflow descriptions by using a target independent,
high-level API, and a dataflow model that allows for
explicit spatial and temporal parallelism. READY
directly maps the dataflow kernels onto the
accelerator. Our tool is flexible and extensible and
provides the infrastructure to explore different
accelerator designs. We validate READY on the Intel
Harp platform, and our experimental results show an
average 2$ \times $ execution runtime improvement when compared
to an 8-thread multi-core processor.",
acknowledgement = ack-nhfb,
articleno = "56",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840",
author = "Fausto Spoto and Elisa Burato and Michael D. Ernst and
Pietro Ferrara and Alberto Lovato and Damiano Macedonio
and Ciprian Spiridon",
title = "Static Identification of Injection Attacks in {Java}",
journal = j-TOPLAS,
volume = "41",
number = "3",
pages = "18:1--18:??",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1145/3332371",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Sat Nov 23 07:18:02 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
URL = "https://dl.acm.org/ft_gateway.cfm?id=3332371",
abstract = "The most dangerous security-related software errors,
according to the OWASP Top Ten 2017 list, affect web
applications. They are potential injection attacks that
exploit user-provided data to execute undesired
operations: database access and updates (SQL injection);
generation of malicious web pages (cross-site
scripting injection); redirection to user-specified
web pages (redirect injection); execution of OS
commands and arbitrary scripts (command injection);
loading of user-specified, possibly heavy or dangerous
classes at run time (reflection injection); access to
arbitrary files on the file system (path-traversal);
and storing user-provided data into heap regions
normally assumed to be shielded from the outside world
(trust boundary violation). All these attacks exploit
the same weakness: unconstrained propagation of data
from sources that the user of a web application
controls into sinks whose activation might trigger
dangerous operations. Although web applications are
written in a variety of languages, Java remains a
frequent choice, in particular for banking
applications, where security has tangible relevance.
This article defines a unified, sound protection
mechanism against such attacks, based on the
identification of all possible explicit flows of
tainted data in Java code. Such flows can be
arbitrarily complex, passing through dynamically
allocated data structures in the heap. The analysis is
based on abstract interpretation and is
interprocedural, flow-sensitive, and context-sensitive.
Its notion of taint applies to reference
(non-primitive) types dynamically allocated in the heap
and is object-sensitive and field-sensitive. The
analysis works by translating the program into Boolean
formulas that model all possible data flows. Its
implementation, within the Julia analyzer for Java and
Android, found injection security vulnerabilities in
the Internet banking service and in the customer
relationship management of large Italian banks, as well
as in a set of open-source third-party applications. It
found the command injection, which is at the origin of
the 2017 Equifax data breach, one of the worst data
breaches ever. For objective, repeatable results, this
article also evaluates the implementation on two
open-source security benchmarks: the Juliet Suite and
the OWASP Benchmark for the automatic comparison of
static analyzers for cybersecurity. We compared this
technique against more than 10 other static analyzers,
both free and commercial. The result of these
experiments is that ours is the only analysis for
injection that is sound (up to well-stated limitations
such as multithreading and native code) and works on
industrial code, and it is also much more precise than
other tools.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Transactions on Programming Languages and
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783",
author = "Kyle Storey and Eric Mercer and Pavel Parizek",
title = "A Sound Dynamic Partial Order Reduction Engine for
{Java Pathfinder}",
journal = j-SIGSOFT,
volume = "44",
number = "4",
pages = "15--15",
month = dec,
year = "2019",
DOI = "https://doi.org/10.1145/3364452.3364457",
ISSN = "0163-5948 (print), 1943-5843 (electronic)",
ISSN-L = "0163-5948",
bibdate = "Wed Mar 24 14:07:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
URL = "https://dl.acm.org/doi/10.1145/3364452.3364457",
abstract = "When model checking a multi-threaded program, it is
often necessary to enumerate the possible ordering of
concurrent events to evaluate the behavior of the
program. However, enumerating every possible order of
events quickly leads to state-space \ldots{}",
acknowledgement = ack-nhfb,
fjournal = "ACM SIGSOFT Software Engineering Notes",
journal-URL = "https://dl.acm.org/loi/sigsoft",
author = "Xing Su and Xiangke Liao and Hao Jiang and Canqun Yang
and Jingling Xue",
title = "{SCP}: Shared Cache Partitioning for High-Performance
journal = j-TACO,
volume = "15",
number = "4",
pages = "43:1--43:??",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3274654",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Jan 8 17:20:00 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/ft_gateway.cfm?id=3274654",
abstract = "GEneral Matrix Multiply (GEMM) is the most fundamental
computational kernel routine in the BLAS library. To
achieve high performance, in-memory data must be
prefetched into fast on-chip caches before they are
used. Two techniques, software prefetching and data
packing, have been used to effectively exploit the
capability of on-chip least recently used (LRU) caches,
which are popular in traditional high-performance
processors used in high-end servers and supercomputers.
However, the market has recently witnessed a new
diversity in processor design, resulting in
high-performance processors equipped with shared caches
with non-LRU replacement policies. This poses a
challenge to the development of high-performance GEMM
in a multithreaded context. As several threads try to
load data into a shared cache simultaneously,
interthread cache conflicts will increase
significantly. We present a Shared Cache Partitioning
(SCP) method to eliminate interthread cache conflicts
in the GEMM routines, by partitioning a shared cache
into physically disjoint sets and assigning different
sets to different threads. We have implemented SCP in
the OpenBLAS library and evaluated it on Phytium 2000+,
a 64-core AArch64 processor with private LRU L1 caches
and shared pseudo-random L2 caches (per four-core
cluster). Our evaluation shows that SCP has effectively
reduced the conflict misses in both L1 and L2 caches in
a highly optimized GEMM implementation, resulting in an
improvement of its performance by 2.75\% to 6.91\%.",
acknowledgement = ack-nhfb,
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924",
author = "Robert Utterback and Kunal Agrawal and I-Ting Angelina
Lee and Milind Kulkarni",
title = "Processor-Oblivious Record and Replay",
journal = j-TOPC,
volume = "6",
number = "4",
pages = "20:1--20:??",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3365659",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Dec 27 16:13:12 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/ft_gateway.cfm?id=3365659",
abstract = "Record-and-replay systems are useful tools for
debugging non-deterministic parallel programs by first
recording an execution and then replaying that
execution to produce the same access pattern. Existing
record-and-replay systems generally target thread-based
execution models, and record the behaviors and
interleavings of individual threads. Dynamic
multithreaded languages and libraries, such as the Cilk
family, OpenMP, TBB, and the like, do not have a notion
of threads. Instead, these languages provide a
processor-oblivious model of programming, where
programs expose task parallelism using high-level
constructs such as spawn/sync without regard to the
number of threads/cores available to run the program.
Thread-based record-and-replay would violate the
processor-oblivious nature of these programs, as they
incorporate the number of threads into the recorded
information, constraining the replayed execution to the
same number of threads. In this article, we present a
processor-oblivious record-and-replay scheme for
dynamic multithreaded languages where record and replay
can use different number of processors and both are
scheduled using work stealing. We provide theoretical
guarantees for our record and replay scheme---namely that
record is optimal for programs with one lock and replay
is near-optimal for all cases. In addition, we
implemented this scheme in the Cilk Plus runtime system
and our evaluation indicates that
processor-obliviousness does not cause substantial
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
author = "Vanchinathan Venkataramani and Mun Choon Chan and
Tulika Mitra",
title = "Scratchpad-Memory Management for Multi-Threaded
Applications on Many-Core Architectures",
journal = j-TECS,
volume = "18",
number = "1",
pages = "10:1--10:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3301308",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Oct 17 18:16:42 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/ft_gateway.cfm?id=3301308",
abstract = "Contemporary many-core architectures, such as Adapteva
Epiphany and Sunway TaihuLight, employ per-core
software-controlled Scratchpad Memory (SPM) rather than
caches for better performance-per-watt and
predictability. In these architectures, a core is
allowed to access its own SPM as well as remote SPMs
through the Network-On-Chip (NoC). However, the
compiler/programmer is required to explicitly manage
the movement of data between SPMs and off-chip memory.
Utilizing SPMs for multi-threaded applications is even
more challenging, as the shared variables across the
threads need to be placed appropriately. Accessing
variables from remote SPMs with higher access latency
further complicates this problem as certain links in
the NoC may be heavily contended by multiple threads.
Therefore, certain variables may need to be replicated
in multiple SPMs to reduce the contention delay and/or
the overall access time. We present Coordinated Data
Management (CDM), a compile-time framework that
automatically identifies shared/private variables and
places them with replication (if necessary) to suitable
on-chip or off-chip memory, taking NoC contention into
consideration. We develop both an exact Integer Linear
Programming (ILP) formulation as well as an iterative,
scalable algorithm for placing the data variables in
multi-threaded applications on many-core SPMs.
Experimental evaluation on the Parallella hardware
platform confirms that our allocation strategy reduces
the overall execution time and energy consumption by $
1.84 \times $ and $ 1.83 \times $, respectively, when
compared to the existing approaches.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840",
author = "L. Wang and M. Jahre and A. Adileh and Z. Wang and L.
title = "Modeling Emerging Memory-Divergent {GPU}
volume = "18",
number = "2",
pages = "95--98",
month = jul,
year = "2019",
DOI = "https://doi.org/10.1109/LCA.2019.2923618",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Oct 1 10:18:16 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
abstract = "Analytical performance models yield valuable
architectural insight without incurring the excessive
runtime overheads of simulation. In this work, we study
contemporary GPU applications and find that the key
performance-related behavior of such applications is
distinct from traditional GPU applications. The key
issue is that these GPU applications are
memory-intensive and have poor spatial locality, which
implies that the loads of different threads commonly
access different cache blocks. Such memory-divergent
applications quickly exhaust the number of misses the
L1 cache can process concurrently, and thereby cripple
the GPU's ability to use Memory-Level Parallelism (MLP)
and Thread-Level Parallelism (TLP) to hide memory
latencies. Our Memory Divergence Model (MDM) is able to
accurately represent this behavior and thereby reduces
average performance prediction error by $ 14 \times $
compared to the state-of-the-art GPUMech approach
across our memory-divergent applications.",
acknowledgement = ack-nhfb,
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Analytical models; analytical performance models;
Analytical performance prediction; average performance
prediction error; cache blocks; cache storage;
Computational modeling; contemporary GPU applications;
GPU; graphics processing units; Graphics processing
units; Instruction sets; key performance-related
behavior; L1 cache; Mathematical model; memory
architecture; memory divergence model; memory
latencies; memory-divergent applications;
memory-divergent GPU applications; memory-intensive;
memory-level parallelism; multi-threading;
multiprocessing systems; Predictive models; Random
access memory; thread-level parallelism; traditional
GPU applications; valuable architectural insight",
author = "Wenlu Wang and Ji Zhang and Min-Te Sun and Wei-Shinn
title = "A scalable spatial skyline evaluation system utilizing
parallel independent region groups",
journal = j-VLDB-J,
volume = "28",
number = "1",
pages = "73--98",
month = feb,
year = "2019",
DOI = "https://doi.org/10.1007/s00778-018-0519-4",
ISSN = "1066-8888 (print), 0949-877X (electronic)",
ISSN-L = "1066-8888",
bibdate = "Tue Feb 5 08:07:20 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "This research presents two parallel solutions to
efficiently address spatial skyline queries. First, we
propose a novel concept called independent regions for
parallelizing the process of spatial skyline
evaluation. Spatial skyline candidates in an
independent region do not depend on any data point in
other independent regions. Then, we propose a GPU-based
solution. We use multi-level independent region
group-based parallel filter to support efficient
multi-threading spatial skyline non-candidate
elimination. Beyond that, we propose comparable region
to accelerate non-candidate elimination in each
independent region. Secondly, we propose a
MapReduce-based solution. We generate the convex hull
of query points in the first MapReduce phase. In the
second phase, we calculate independent regions based on
the input data points and the convex hull of the query
points. With the independent regions, spatial skylines
are evaluated in parallel in the third phase, in which
data points are partitioned by their associated
independent regions in map functions, and spatial
skyline candidates are calculated by reduce functions.
The results of the spatial skyline queries are the
union of outputs from the reduce functions. Our
experimental results show that GPU multi-threading
scheme is very efficient on small-scale input datasets.
On the contrary, MapReduce scheme performs very well on
large-scale input datasets.",
acknowledgement = ack-nhfb,
fjournal = "VLDB Journal: Very Large Data Bases",
journal-URL = "http://portal.acm.org/toc.cfm?id=J869",
author = "Conrad Watt and Andreas Rossberg and Jean
title = "Weakening {WebAssembly}",
journal = j-PACMPL,
volume = "3",
number = "OOPSLA",
pages = "133:1--133:28",
month = oct,
year = "2019",
DOI = "https://doi.org/10.1145/3360559",
bibdate = "Fri Aug 7 19:22:30 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
URL = "https://dl.acm.org/doi/abs/10.1145/3360559",
abstract = "WebAssembly (Wasm) is a safe, portable virtual
instruction set that can be hosted in a wide range of
environments, such as a Web browser. It is a low-level
language whose instructions are intended to compile
directly to bare hardware. While the initial version of
Wasm focussed on single-threaded computation, a recent
proposal extends it with low-level support for multiple
threads and atomic instructions for synchronised access
to shared memory. To support the correct compilation of
concurrent programs, it is necessary to give a suitable
specification of its memory model.\par
Wasm's language definition is based on a fully
formalised specification that carefully avoids
undefined behaviour. We present a substantial extension
to this semantics, incorporating a relaxed memory
model, along with a few proposed extensions. Wasm's
memory model is unique in that its linear address space
can be dynamically grown during execution, while all
accesses are bounds-checked. This leads to the novel
problem of specifying how observations about the size
of the memory can propagate between threads. We argue
that, considering desirable compilation schemes, we
cannot give a sequentially consistent semantics to
memory growth.\par
We show that our model provides sequential consistency
for data-race-free executions (SC-DRF). However,
because Wasm is to run on the Web, we must also
consider interoperability of its model with that of
JavaScript. We show, by counter-example, that
JavaScript's memory model is not SC-DRF, in contrast to
what is claimed in its specification. We propose two
axiomatic conditions that should be added to the
JavaScript model to correct this difference.\par
We also describe a prototype SMT-based litmus tool
which acts as an oracle for our axiomatic model,
visualising its behaviours, including memory
acknowledgement = ack-nhfb,
articleno = "133",
fjournal = "Proceedings of the ACM on Programming Languages",
journal-URL = "https://pacmpl.acm.org/",
author = "Jimmy Ming-Tai Wu and Jerry Chun-Wei Lin and Ashish
title = "High-Utility Itemset Mining with Effective Pruning
journal = j-TKDD,
volume = "13",
number = "6",
pages = "58:1--58:??",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3363571",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Wed Dec 18 14:31:03 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/ft_gateway.cfm?id=3363571",
abstract = "High-utility itemset mining is a popular data mining
problem that considers utility factors, such as
quantity and unit profit of items besides frequency
measure from the transactional database. It helps to
find the most valuable and profitable products/items
that are difficult to track by using only the frequent
itemsets. An item might have a high-profit value which
is rare in the transactional database and has a
tremendous importance. While there are many existing
algorithms to find high-utility itemsets (HUIs) that
generate comparatively large candidate sets, our main
focus is on significantly reducing the computation time
with the introduction of new pruning strategies. The
designed pruning strategies help to reduce the
visitation of unnecessary nodes in the search space,
which reduces the time required by the algorithm. In
this article, two new stricter upper bounds are
designed to reduce the computation time by refraining
from visiting unnecessary nodes of an itemset. Thus,
the search space of the potential HUIs can be greatly
reduced, and the mining procedure of the execution time
can be improved. The proposed strategies can also
significantly minimize the transaction database
generated on each node. Experimental results showed
that the designed algorithm with two pruning strategies
outperform the state-of-the-art algorithms for mining
the required HUIs in terms of runtime and number of
revised candidates. The memory usage of the designed
algorithm also outperforms the state-of-the-art
approach. Moreover, a multi-thread concept is also
discussed to further handle the problem of big
acknowledgement = ack-nhfb,
articleno = "58",
fjournal = "ACM Transactions on Knowledge Discovery from Data
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
author = "Tsung Tai Yeh and Amit Sabne and Putt Sakdhnagool and
Rudolf Eigenmann and Timothy G. Rogers",
title = "{Pagoda}: a {GPU} Runtime System for Narrow Tasks",
journal = j-TOPC,
volume = "6",
number = "4",
pages = "21:1--21:??",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3365657",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Nov 20 07:59:59 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Massively multithreaded GPUs achieve high throughput
by running thousands of threads in parallel. To fully
utilize their hardware, contemporary workloads
spawn work to the GPU in bulk by launching large tasks,
where each task is a kernel that contains thousands of
threads that occupy the entire GPU. GPUs face severe
underutilization and their performance benefits vanish
if the tasks are narrow, i.e., they contain less than
512 threads. Latency-sensitive applications in network,
signal, and image processing that generate a large
number of tasks with relatively small inputs are
examples of such limited parallelism. This article
presents Pagoda, a runtime system that virtualizes GPU
resources, using an OS-like daemon kernel called
MasterKernel. Tasks are spawned from the CPU onto
Pagoda as they become available, and are scheduled by
the MasterKernel at the warp granularity. This level of
control enables the GPU to keep scheduling and
executing tasks as long as free warps are found,
dramatically reducing underutilization. Experimental
results on real hardware demonstrate that Pagoda
achieves a geometric mean speedup of 5.52X over
PThreads running on a 20-core CPU, 1.76X over
CUDA-HyperQ, and 1.44X over GeMTC, the state-of-the-art
runtime GPU task scheduling system.",
acknowledgement = ack-nhfb,
articleno = "21",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
author = "Guanwen Zhong and Akshat Dubey and Cheng Tan and
Tulika Mitra",
title = "{Synergy}: an {HW\slash SW} Framework for High
Throughput {CNNs} on Embedded Heterogeneous {SoC}",
journal = j-TECS,
volume = "18",
number = "2",
pages = "13:1--13:??",
month = apr,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3301278",
ISSN = "1539-9087 (print), 1558-3465 (electronic)",
ISSN-L = "1539-9087",
bibdate = "Thu Oct 17 18:16:43 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/ft_gateway.cfm?id=3301278",
abstract = "Convolutional Neural Networks (CNN) have been widely
deployed in diverse application domains. There has been
significant progress in accelerating both their
training and inference using high-performance GPUs,
FPGAs, and custom ASICs for datacenter-scale
environments. The recent proliferation of mobile and
Internet of Things (IoT) devices have necessitated
real-time, energy-efficient deep neural network
inference on embedded-class, resource-constrained
platforms. In this context, we present Synergy, an
automated, hardware-software co-designed, pipelined,
high-throughput CNN inference framework on embedded
heterogeneous system-on-chip (SoC) architectures
(Xilinx Zynq). Synergy leverages, through
multi-threading, all the available on-chip resources,
which includes the dual-core ARM processor along with
the FPGA and the NEON Single-Instruction Multiple-Data
(SIMD) engines as accelerators. Moreover, Synergy
provides a unified abstraction of the heterogeneous
accelerators (FPGA and NEON) and can adapt to different
network configurations at runtime without changing the
underlying hardware accelerator architecture by
balancing workload across accelerators through
work-stealing. Synergy achieves 7.3X speedup, averaged
across seven CNN models, over a well-optimized
software-only solution. Synergy demonstrates
substantially better throughput and energy-efficiency
compared to the contemporary CNN implementations on the
same SoC architecture.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Embedded Computing Systems",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840",
author = "Vasileios Zois and Vassilis J. Tsotras and Walid A.
title = "Efficient main-memory top-$k$ selection for multicore
volume = "13",
number = "2",
pages = "114--127",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3364324.3364327",
ISSN = "2150-8097",
bibdate = "Wed Dec 11 07:51:12 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
abstract = "Efficient Top-$k$ query evaluation relies on practices
that utilize auxiliary data structures to enable early
termination. Such techniques were designed to trade-off
complex work in the buffer pool against costly access
to disk-resident data. Parallel in-memory Top-$k$
selection with support for early termination presents a
novel challenge because computation shifts higher up in
the memory hierarchy. In this environment, data scan
methods using SIMD instructions and multithreading
perform well despite requiring evaluation of the
complete dataset. Early termination schemes that favor
simplicity require random access to resolve score
ambiguity while those optimized for sequential access
incur too many object evaluations. In this work, we
introduce the concept of rank uncertainty, a measure of
work efficiency that enables classifying existing
solutions according to their potential for efficient
parallel in-memory Top-$k$ selection. We identify data
reordering and layering strategies as those having the
highest potential and provide practical guidelines on
how to adapt them for parallel in-memory execution
(creating the VTA and SLA approaches). In addition, we
show that the number of object evaluations can be
further decreased by combining data reordering with
angle space partitioning (introducing PTA). Our
extensive experimental evaluation on varying query
parameters using both synthetic and real data, showcase
that PTA exhibits between 2 and 4 orders of magnitude
better query latency, and throughput when compared to
prior work and our optimized algorithmic variants (i.e.
VTA, SLA).",
acknowledgement = ack-nhfb,
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
author = "Abdulelah Algosaibi and Khaled Ragab and Saleh
title = "Parallel-Based Techniques for Managing and Analyzing
the Performance on Semantic Graph",
volume = "30",
number = "02",
pages = "??--??",
month = jun,
year = "2020",
DOI = "https://doi.org/10.1142/S0129626420500073",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
ISSN-L = "0129-6264",
bibdate = "Mon Mar 29 12:30:13 MDT 2021",
bibsource = "http://ejournals.wspc.com.sg/ppl/;
URL = "https://www.worldscientific.com/doi/10.1142/S0129626420500073",
abstract = "In recent years, data are generated rapidly that
advanced the evolving of the linked data. Modern data
are globally distributed over the semantically linked
graphs. The nature of the distributed data over the
semantic graph raised new demands on further
investigation on improving performance on the semantic
graphs. In this work, we analyzed the time latency as
an important factor to be further investigated and
improved. We evaluated the parallel computing on these
distributed data in order to better utilize the
parallelism approaches. A federation framework based on
a multi-threaded environment supporting federated
SPARQL query was introduced. In our experiments, we
show the achievability and effectiveness of our model
on a set of real-world queries through real-world Linked
Open Data cloud. Significant performance improvement
has been noticed. Further, we highlight short-comings that
could open an avenue in the research of federated
queries. Keywords: Semantic web; distributed query
processing; query federation; linked data; join
acknowledgement = ack-nhfb,
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "Mehdi Bagherzadeh and Nicholas Fireman and Anas
Shawesh and Raffi Khatchadourian",
title = "Actor concurrency bugs: a comprehensive study on
symptoms, root causes, {API} usages, and differences",
journal = j-PACMPL,
volume = "4",
number = "OOPSLA",
pages = "214:1--214:32",
month = nov,
year = "2020",
DOI = "https://doi.org/10.1145/3428282",
bibdate = "Tue Mar 30 08:10:50 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/10.1145/3428282",
abstract = "Actor concurrency is becoming increasingly important
in the development of real-world software systems.
Although actor concurrency may be less susceptible to
some multithreaded concurrency bugs, such as low-level
data races and deadlocks, it comes with \ldots{}",
acknowledgement = ack-nhfb,
articleno = "214",
fjournal = "Proceedings of the ACM on Programming Languages",
journal-URL = "https://pacmpl.acm.org/",
author = "D. A. Barros and C. Bentes",
booktitle = "{2020 IEEE 32nd International Symposium on Computer
Architecture and High Performance Computing
title = "Analyzing the Loop Scheduling Mechanisms on {Julia}
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "257--264",
year = "2020",
DOI = "https://doi.org/10.1109/SBAC-PAD49847.2020.00043",
bibdate = "Thu Apr 8 07:17:08 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/julia.bib;
acknowledgement = ack-nhfb,
keywords = "Julia programming language",
author = "A. Castell{\'o} and R. M. Gual and S. Seo and P.
Balaji and E. S. Quintana-Ort{\'\i} and A. J.
title = "Analysis of Threading Libraries for High Performance
journal = j-IEEE-TRANS-COMPUT,
volume = "69",
number = "9",
pages = "1279--1292",
month = sep,
year = "2020",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Wed Aug 12 14:58:16 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "K. Criswell and T. Adegbija",
title = "A Survey of Phase Classification Techniques for
Characterizing Variable Application Behavior",
volume = "31",
number = "1",
pages = "224--236",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1109/TPDS.2019.2929781",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu Dec 19 09:20:35 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
journal-URL = "http://www.computer.org/portal/web/csdl/transactions/tpds",
keywords = "adaptable computing; Big Data; big data; Clocks;
Computational modeling; dynamic optimization; edge
computing; emerging applications; Hardware; Multicore
processing; multithreaded applications; Optimization;
Phase classification; Runtime; variable program
behavior; workload characterization",
author = "Ilke {\c{C}}ugu and Murat Manguoglu",
title = "A parallel multithreaded sparse triangular linear
system solver",
journal = j-COMPUT-MATH-APPL,
volume = "80",
number = "2",
pages = "371--385",
month = jul,
year = "2020",
DOI = "https://doi.org/10.1016/j.camwa.2019.09.012",
ISSN = "0898-1221 (print), 1873-7668 (electronic)",
ISSN-L = "0898-1221",
bibdate = "Wed Jul 8 08:11:16 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/computmathappl2020.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0898122119304602",
acknowledgement = ack-nhfb,
fjournal = "Computers and Mathematics with Applications",
journal-URL = "http://www.sciencedirect.com/science/journal/08981221",
author = "Matthew G. F. Dosanjh and Ryan E. Grant and Whit
Schonbein and Patrick G. Bridges",
title = "Tail queues: a multi-threaded matching architecture",
journal = j-CCPE,
volume = "32",
number = "3",
pages = "e5158:1--e5158:??",
day = "10",
month = feb,
year = "2020",
DOI = "https://doi.org/10.1002/cpe.5158",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Wed Mar 31 07:52:13 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
acknowledgement = ack-nhfb,
ajournal = "Concurr. Comput.",
fjournal = "Concurrency and Computation: Practice and Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "06 February 2019",
author = "J. Feliu and J. Sahuquillo and S. Petit and L.
title = "Thread Isolation to Improve Symbiotic Scheduling on
{SMT} Multicore Processors",
volume = "31",
number = "2",
pages = "359--373",
month = feb,
year = "2020",
DOI = "https://doi.org/10.1109/TPDS.2019.2934955",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Wed Jan 22 06:09:50 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
journal-URL = "http://www.computer.org/portal/web/csdl/transactions/tpds",
keywords = "Degradation; Message systems; Program processors;
Resource management; Schedules; Simultaneous
multithreading (SMT); Symbiosis; symbiotic job
scheduling; thread isolation; Throughput",
author = "Pietro Fezzardi and Fabrizio Ferrandi",
title = "Automated Bug Detection for High-level Synthesis of
Multi-threaded Irregular Applications",
journal = j-TOPC,
volume = "7",
number = "4",
pages = "27:1--27:26",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3418086",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Sun Mar 28 08:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/10.1145/3418086",
abstract = "Field Programmable Gate Arrays (FPGAs) are becoming an
appealing technology in datacenters and High
Performance Computing. High-Level Synthesis (HLS) of
multi-threaded parallel programs is increasingly used
to extract parallelism. Despite great leaps \ldots{}",
acknowledgement = ack-nhfb,
articleno = "27",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "https://dl.acm.org/loi/topc",
author = "Mehrdad Ghorbani and Seyed Morteza Babamir",
title = "Runtime deadlock tracking and prevention of concurrent
multithreaded programs: a learning-based approach",
journal = j-CCPE,
volume = "32",
number = "10",
pages = "e5324:1--e5324:??",
day = "25",
month = may,
year = "2020",
DOI = "https://doi.org/10.1002/cpe.5324",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Wed Mar 31 07:52:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib;
acknowledgement = ack-nhfb,
ajournal = "Concurr. Comput.",
fjournal = "Concurrency and Computation: Practice and Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "09 May 2019",
author = "Rich Hickey",
title = "A history of {Clojure}",
journal = j-PACMPL,
volume = "4",
number = "HOPL",
pages = "71:1--71:46",
month = jun,
year = "2020",
DOI = "https://doi.org/10.1145/3386321",
bibdate = "Fri Aug 7 17:39:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2020.bib;
URL = "https://dl.acm.org/doi/abs/10.1145/3386321",
abstract = "Clojure was designed to be a general-purpose,
practical functional language, suitable for use by
professionals wherever its host language, e.g., Java,
would be. Initially designed in 2005 and released in
2007, Clojure is a dialect of Lisp, but is not a direct
descendant of any prior Lisp. It complements
programming with pure functions of immutable data with
concurrency-safe state management constructs that
support writing correct multithreaded programs without
the complexity of mutex locks.\par
Clojure is intentionally hosted, in that it compiles to
and runs on the runtime of another language, such as
the JVM. This is more than an implementation strategy;
numerous features ensure that programs written in
Clojure can leverage and interoperate with the
libraries of the host language directly and
In spite of combining two (at the time) rather
unpopular ideas, functional programming and Lisp,
Clojure has since seen adoption in industries as
diverse as finance, climate science, retail, databases,
analytics, publishing, healthcare, advertising and
genomics, and by consultancies and startups worldwide,
much to the career-altering surprise of its
Most of the ideas in Clojure were not novel, but their
combination puts Clojure in a unique spot in language
design (functional, hosted, Lisp). This paper recounts
the motivation behind the initial development of
Clojure and the rationale for various design decisions
and language constructs. It then covers its evolution
subsequent to release and adoption.",
acknowledgement = ack-nhfb,
articleno = "71",
fjournal = "Proceedings of the ACM on Programming Languages",
journal-URL = "https://pacmpl.acm.org/",
author = "Sungjin Im and Benjamin Moseley and Kamesh Munagala
and Kirk Pruhs",
title = "Dynamic Weighted Fairness with Minimal Disruptions",
journal = j-POMACS,
volume = "4",
number = "1",
pages = "19:1--19:18",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3379485",
ISSN = "2476-1249",
ISSN-L = "2476-1249",
bibdate = "Mon Mar 29 10:31:33 MDT 2021",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/pomacs.bib;
URL = "https://dl.acm.org/doi/10.1145/3379485",
abstract = "In this paper, we consider the following dynamic fair
allocation problem: Given a sequence of job arrivals
and departures, the goal is to maintain an
approximately fair allocation of the resource against a
target fair allocation policy, while minimizing the
total number of disruptions, which is the number of
times the allocation of any job is changed. We consider
a rich class of fair allocation policies that
significantly generalize those considered in previous
work. We first consider the models where jobs only
arrive, or jobs only depart. We present tight upper and
lower bounds for the number of disruptions required to
maintain a constant approximate fair allocation every
time step. In particular, for the canonical case where
jobs have weights and the resource allocation is
proportional to the job's weight, we show that
maintaining a constant approximate fair allocation
requires $ \Theta (\log^* n) $ disruptions per job,
almost matching the bounds in prior work for the unit
weight case. For the more general setting where the
allocation policy only decreases the allocation to a
job when new jobs arrive, we show that maintaining a
constant approximate fair allocation requires $ \Theta
(\log n) $ disruptions per job. We then consider the
model where jobs can both arrive and depart. We first
show strong lower bounds on the number of disruptions
required to maintain constant approximate fairness for
arbitrary instances. In contrast we then show that
there is an algorithm that can maintain constant
approximate fairness with $ O(1) $ expected disruptions
per job if the weights of the jobs are independent of
the jobs arrival and departure order. We finally show
how our results can be extended to the setting with
multiple resources.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "Proceedings of the ACM on Measurement and Analysis of
Computing Systems (POMACS)",
journal-URL = "https://dl.acm.org/loi/pomacs",
author = "Daniel Langr and Marin Ko{\v{c}}i{\v{c}}ka",
title = "Reducing the Impact of Intensive Dynamic Memory
Allocations in Parallel Multi-Threaded Programs",
volume = "31",
number = "5",
pages = "1152--1164",
month = may,
year = "2020",
DOI = "https://doi.org/10.1109/TPDS.2019.2960514",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu Feb 20 10:08:58 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
keywords = "Dynamic memory allocation; memory pooling;
multi-threading; parallel program; scalable heap
implementation; shared memory; small buffer
author = "Tao Li and Xiankai Zhang and Feng Luo and Fang-Xiang
Wu and Jianxin Wang",
title = "{MultiMotifMaker}: a Multi-Thread Tool for Identifying
{DNA} Methylation Motifs from {Pacbio} Reads",
journal = j-TCBB,
volume = "17",
number = "1",
pages = "220--225",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1109/TCBB.2018.2861399",
ISSN = "1545-5963 (print), 1557-9964 (electronic)",
ISSN-L = "1545-5963",
bibdate = "Wed Jun 10 07:29:48 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/abs/10.1109/TCBB.2018.2861399",
abstract = "The methylation of DNA is an important mechanism to
control biological processes. Recently, the Pacbio SMRT
technology provides a new way to identify base
methylation in the genome. MotifMaker is a tool
developed by Pacbio for discovering DNA methylation
acknowledgement = ack-nhfb,
fjournal = "IEEE/ACM Transactions on Computational Biology and
journal-URL = "https://dl.acm.org/loi/tcbb",
author = "Jos{\'e} Puche and Salvador Petit and Mar{\'\i}a E.
G{\'o}mez and Julio Sahuquillo",
title = "An efficient cache flat storage organization for
multithreaded workloads for low power processors",
journal = j-FUT-GEN-COMP-SYS,
volume = "110",
number = "??",
pages = "1037--1054",
month = sep,
year = "2020",
DOI = "https://doi.org/10.1016/j.future.2019.11.024",
ISSN = "0167-739X (print), 1872-7115 (electronic)",
ISSN-L = "0167-739X",
bibdate = "Fri Jun 19 07:44:19 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/futgencompsys.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0167739X1930384X",
acknowledgement = ack-nhfb,
fjournal = "Future Generation Computer Systems",
journal-URL = "http://www.sciencedirect.com/science/journal/0167739X",
author = "Anita Tino and Caroline Collange and Andr{\'e}
title = "{SIMT-X}: Extending Single-Instruction Multi-Threading
to Out-of-Order Cores",
journal = j-TACO,
volume = "17",
number = "2",
pages = "15:1--15:23",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3392032",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jun 27 12:06:50 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/abs/10.1145/3392032",
abstract = "This work introduces Single Instruction Multi-Thread
Express (SIMT-X), a general-purpose Central Processing
Unit (CPU) microarchitecture that enables Graphics
Processing Units (GPUs)-style SIMT execution across
multiple threads of the same program for high
throughput, while retaining the latency benefits of
out-of-order execution, and the programming convenience
of homogeneous multi-thread processors. SIMT-X
leverages the existing Single Instruction Multiple Data
(SIMD) back-end to provide CPU/GPU-like processing on a
single core with minimal overhead. We demonstrate that
although SIMT-X invokes a restricted form of
Out-of-Order (OoO), the microarchitecture successfully
captures a majority of the benefits of aggressive OoO
execution using at most two concurrent register
mappings per architectural register, while addressing
issues of partial dependencies and supporting a
general-purpose Instruction Set Architecture (ISA).",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "https://dl.acm.org/loi/taco",
author = "Tang Wenjie and Yao Yiping and Li Tianlin and Song
Xiao and Zhu Feng",
title = "An Adaptive Persistence and Work-stealing Combined
Algorithm for Load Balancing on Parallel Discrete Event
journal = j-TOMACS,
volume = "30",
number = "2",
pages = "12:1--12:26",
month = apr,
year = "2020",
DOI = "https://doi.org/10.1145/3364218",
ISSN = "1049-3301 (print), 1558-1195 (electronic)",
ISSN-L = "1049-3301",
bibdate = "Tue Apr 21 08:08:16 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/abs/10.1145/3364218",
abstract = "Load imbalance has always been a crucial challenge in
Parallel Discrete Event Simulation (PDES). In the past
few years, we have witnessed an increased interest in
using multithreading PDES on multi/many-core platforms.
In multithreading PDES, migrating \ldots{}",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Modeling and Computer Simulation",
journal-URL = "https://dl.acm.org/loi/tomacs",
author = "Alexander J. Yee",
title = "{{\tt y-cruncher}}: a multi-threaded pi-program",
howpublished = "Web site",
day = "30",
month = mar,
year = "2020",
bibdate = "Tue Apr 21 16:09:31 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.numberworld.org/y-cruncher/",
abstract = "How fast can your computer compute Pi?\par
y-cruncher is a program that can compute Pi and other
constants to trillions of digits.\par
It is the first of its kind that is multi-threaded and
scalable to multi-core systems. Ever since its launch
in 2009, it has become a common benchmarking and
stress-testing application for overclockers and
hardware enthusiasts.\par
y-cruncher has been used to set several world records
for the most digits of Pi ever computed:\par
50 trillion digits - January 2020 (Timothy
31.4 trillion digits - January 2019 (Emma Haruka
22.4 trillion digits - November 2016 (Peter
13.3 trillion digits - October 2014 (Sandon Van Ness
12.1 trillion digits - December 2013 (Shigeru
10 trillion digits - October 2011 (Shigeru Kondo)\par
5 trillion digits - August 2010 (Shigeru Kondo)",
acknowledgement = ack-nhfb,
author = "L. Yin and W. Dong and W. Liu and J. Wang",
title = "On Scheduling Constraint Abstraction for
Multi-Threaded Program Verification",
volume = "46",
number = "5",
pages = "549--565",
year = "2020",
ISSN = "0098-5589 (print), 1939-3520 (electronic)",
ISSN-L = "0098-5589",
bibdate = "Thu Sep 17 07:36:32 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2020.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Software Engineering",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32",
author = "Amir Akbari and Dennis Giannacopoulos",
title = "An efficient multi-threaded {Newton--Raphson}
algorithm for strong coupling modeling of multi-physics
journal = j-COMP-PHYS-COMM,
volume = "258",
number = "??",
pages = "Article 107563",
month = jan,
year = "2021",
DOI = "https://doi.org/10.1016/j.cpc.2020.107563",
ISSN = "0010-4655 (print), 1879-2944 (electronic)",
ISSN-L = "0010-4655",
bibdate = "Sat Mar 13 08:21:40 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0010465520302708",
acknowledgement = ack-nhfb,
fjournal = "Computer Physics Communications",
journal-URL = "http://www.sciencedirect.com/science/journal/00104655",
author = "Arif Arman and Dmitri Loguinov",
title = "{Origami}: a high-performance mergesort framework",
volume = "15",
number = "2",
pages = "259--271",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489507",
ISSN = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/10.14778/3489496.3489507",
abstract = "Mergesort is a popular algorithm for sorting
real-world workloads as it is immune to data skewness,
suitable for parallelization using vectorized
intrinsics, and relatively simple to multi-thread. In
this paper, we introduce Origami, an in-memory merge-.
\ldots{}",
acknowledgement = ack-nhfb,
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
author = "Sanem Arslan and Osman Unsal",
title = "Efficient selective replication of critical code
regions for {SDC} mitigation leveraging redundant
volume = "77",
number = "12",
pages = "14130--14160",
month = dec,
year = "2021",
DOI = "https://doi.org/10.1007/s11227-021-03804-6",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Mon Feb 28 16:44:31 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jsuper2020.bib;
URL = "https://link.springer.com/article/10.1007/s11227-021-03804-6",
acknowledgement = ack-nhfb,
ajournal = "J. Supercomputing",
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Pascal Baumann and Rupak Majumdar and Ramanathan S.
Thinniyam and Georg Zetzsche",
title = "Context-bounded verification of liveness properties
for multithreaded shared-memory programs",
journal = j-PACMPL,
volume = "5",
number = "POPL",
pages = "44:1--44:31",
month = jan,
year = "2021",
DOI = "https://doi.org/10.1145/3434325",
bibdate = "Tue Mar 30 08:10:58 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/10.1145/3434325",
abstract = "We study context-bounded verification of liveness
properties of multi-threaded, shared-memory programs,
where each thread can spawn additional threads. Our
main result shows that context-bounded fair termination
is decidable for the model; context-. \ldots{}",
acknowledgement = ack-nhfb,
articleno = "44",
fjournal = "Proceedings of the ACM on Programming Languages",
journal-URL = "https://pacmpl.acm.org/",
author = "Shane Carroll and Wei-ming Lin",
title = "Exploiting Long-Term Temporal Cache Access Patterns
for {LRU} Insertion Prioritization",
volume = "31",
number = "02",
pages = "??--??",
month = jun,
year = "2021",
DOI = "https://doi.org/10.1142/S0129626421500109",
ISSN = "0129-6264 (print), 1793-642X (electronic)",
ISSN-L = "0129-6264",
bibdate = "Thu Feb 17 06:50:36 MST 2022",
bibsource = "http://ejournals.wspc.com.sg/ppl/;
URL = "https://www.worldscientific.com/doi/10.1142/S0129626421500109",
abstract = "In a CPU cache utilizing least recently used (LRU)
replacement, cache sets manage a buffer which orders
all cache lines in the set from LRU to most recently
used (MRU). When a cache line is brought into cache, it
is placed at the MRU and the LRU line is evicted. When
re-accessed, a line is promoted to the MRU position.
LRU replacement provides a simple heuristic to predict
the optimal cache line to evict. However, LRU utilizes
only simple, short-term access patterns. In this paper,
we propose a method that uses a buffer called the
history queue to record longer-term access-eviction
patterns than the LRU buffer can capture. Using this
information, we make a simple modification to LRU
insertion policy such that recently-recalled blocks
have priority over others. As lines are evicted, their
addresses are recorded in a FIFO history queue.
Incoming lines that have recently been evicted and now
recalled (those in the history queue at recall time)
remain in the MRU for an extended period of time as
non-recalled lines entering the cache thereafter are
placed below the MRU. We show that the proposed LRU
insertion prioritization increases performance in
single-threaded and multi-threaded workloads in
simulations with simple adjustments to baseline LRU.",
acknowledgement = ack-nhfb,
articleno = "2150010",
fjournal = "Parallel Processing Letters",
journal-URL = "http://www.worldscientific.com/loi/ppl",
author = "A. Cheikh and S. Sordillo and A. Mastrandrea and F.
Menichelli and G. Scotti and M. Olivieri",
title = "{Klessydra-T}: Designing Vector Coprocessors for
Multithreaded Edge-Computing Cores",
journal = j-IEEE-MICRO,
volume = "41",
number = "2",
pages = "64--71",
month = mar # "\slash " # apr,
year = "2021",
DOI = "https://doi.org/10.1109/MM.2021.3050962",
ISSN = "0272-1732 (print), 1937-4143 (electronic)",
ISSN-L = "0272-1732",
bibdate = "Thu Apr 1 10:32:23 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hot-chips.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Micro",
journal-URL = "http://www.computer.org/csdl/mags/mi/index.html",
author = "Stefano Conoci and Pierangelo {Di Sanzo} and
Alessandro Pellegrini and Bruno Ciciani and Francesco
title = "On power capping and performance optimization of
multithreaded applications",
journal = j-CCPE,
volume = "33",
number = "13",
pages = "e6205:1--e6205:??",
day = "10",
month = jul,
year = "2021",
DOI = "https://doi.org/10.1002/cpe.6205",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Tue Feb 22 09:49:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe2020.bib;
acknowledgement = ack-nhfb,
ajournal = "Concurrency Computat., Pract. Exper.",
fjournal = "Concurrency and Computation: Practice and Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "27 January 2021",
author = "Claudio Kozick{\'y} and Ivan Simecek",
title = "Joint direct and transposed sparse matrix-vector
multiplication for multithreaded {CPUs}",
journal = j-CCPE,
volume = "33",
number = "13",
pages = "e6236:1--e6236:??",
day = "10",
month = jul,
year = "2021",
DOI = "https://doi.org/10.1002/cpe.6236",
ISSN = "1532-0626 (print), 1532-0634 (electronic)",
ISSN-L = "1532-0626",
bibdate = "Tue Feb 22 09:49:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe2020.bib;
acknowledgement = ack-nhfb,
ajournal = "Concurrency Computat., Pract. Exper.",
fjournal = "Concurrency and Computation: Practice and Experience",
journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626",
onlinedate = "22 February 2021",
author = "Botao Li and Synge Todo and A. C. Maggs and Werner
title = "Multithreaded event-chain {Monte Carlo} with local
journal = j-COMP-PHYS-COMM,
volume = "261",
number = "??",
pages = "Article 107702",
month = apr,
year = "2021",
DOI = "https://doi.org/10.1016/j.cpc.2020.107702",
ISSN = "0010-4655 (print), 1879-2944 (electronic)",
ISSN-L = "0010-4655",
bibdate = "Sat Mar 13 08:21:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0010465520303453",
acknowledgement = ack-nhfb,
fjournal = "Computer Physics Communications",
journal-URL = "http://www.sciencedirect.com/science/journal/00104655",
author = "Xiaoxue Ma and Shangru Wu and Ernest Pobee and Xiupei
Mei and Hao Zhang and Bo Jiang and Wing-Kwong Chan",
title = "{RegionTrack}: a Trace-Based Sound and Complete
Checker to Debug Transactional Atomicity Violations and
Non-Serializable Traces",
journal = j-TOSEM,
volume = "30",
number = "1",
pages = "7:1--7:49",
month = jan,
year = "2021",
DOI = "https://doi.org/10.1145/3412377",
ISSN = "1049-331X (print), 1557-7392 (electronic)",
ISSN-L = "1049-331X",
bibdate = "Fri Jan 22 07:02:14 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/10.1145/3412377",
abstract = "Atomicity is a correctness criterion to reason about
isolated code regions in a multithreaded program when
they are executed concurrently. However, dynamic
instances of these code regions, called transactions,
may fail to behave atomically, resulting in \ldots{}",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Software Engineering and
journal-URL = "https://dl.acm.org/loi/tosem",
author = "Timothy G. Mattson and Todd A. Anderson and Giorgis
title = "\pkg{PyOMP}: Multithreaded Parallel Programming in
journal = j-COMPUT-SCI-ENG,
volume = "23",
number = "6",
pages = "77--80",
month = nov # "\slash " # dec,
year = "2021",
DOI = "https://doi.org/10.1109/MCSE.2021.3128806",
ISSN = "1521-9615 (print), 1558-366X (electronic)",
ISSN-L = "1521-9615",
bibdate = "Mon Jan 31 16:30:09 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/computscieng.bib;
acknowledgement = ack-nhfb,
fjournal = "Computing in Science and Engineering",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992",
author = "Paul Metzger and Volker Seeker and Christian Fensch
and Murray Cole",
title = "Device Hopping: Transparent Mid-Kernel Runtime
Switching for Heterogeneous Systems",
journal = j-TACO,
volume = "18",
number = "4",
pages = "57:1--57:25",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3471909",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 4 07:14:07 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/10.1145/3471909",
abstract = "Existing OS techniques for homogeneous many-core
systems make it simple for single and multithreaded
applications to migrate between cores. Heterogeneous
systems do not benefit so fully from this flexibility,
and applications that cannot migrate in mid-.
\ldots{}",
acknowledgement = ack-nhfb,
articleno = "57",
fjournal = "ACM Transactions on Architecture and Code Optimization
journal-URL = "https://dl.acm.org/loi/taco",
author = "Thomas Nagler",
title = "Code Snippet: {R}-Friendly Multi-Threading in {C++}",
journal = j-J-STAT-SOFT,
volume = "97",
number = "??",
pages = "??--??",
month = "????",
year = "2021",
DOI = "https://doi.org/10.18637/jss.v97.c01",
ISSN = "1548-7660",
ISSN-L = "1548-7660",
bibdate = "Wed May 19 07:43:42 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jstatsoft.bib;
URL = "https://www.jstatsoft.org/index.php/jss/article/view/v097c01;
acknowledgement = ack-nhfb,
journal-URL = "http://www.jstatsoft.org/",
author = "Bashar Romanous and Skyler Windh and Vassilis
title = "Efficient local locking for massively multithreaded
in-memory hash-based operators",
journal = j-VLDB-J,
volume = "30",
number = "3",
pages = "333--359",
month = may,
year = "2021",
DOI = "https://doi.org/10.1007/s00778-020-00642-5",
ISSN = "1066-8888 (print), 0949-877X (electronic)",
ISSN-L = "1066-8888",
bibdate = "Sat Apr 9 10:33:58 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
URL = "https://link.springer.com/article/10.1007/s00778-020-00642-5",
acknowledgement = ack-nhfb,
ajournal = "VLDB J.",
fjournal = "VLDB Journal: Very Large Data Bases",
journal-URL = "http://portal.acm.org/toc.cfm?id=J869",
author = "Nikki Sonenberg and Grzegorz Kielanski and Benny {Van
title = "Performance Analysis of Work Stealing in Large-scale
Multithreaded Computing",
journal = j-TOMPECS,
volume = "6",
number = "2",
pages = "6:1--6:28",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3470887",
ISSN = "2376-3639 (print), 2376-3647 (electronic)",
ISSN-L = "2376-3639",
bibdate = "Wed Mar 2 06:32:09 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/10.1145/3470887",
abstract = "Randomized work stealing is used in distributed
systems to increase performance and improve resource
utilization. In this article, we consider randomized
work stealing in a large system of homogeneous
processors where parent jobs spawn child jobs that can
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Modeling and Performance
Evaluation of Computing Systems (TOMPECS)",
journal-URL = "https://dl.acm.org/loi/tompecs",
author = "Guy L. {Steele Jr.} and Sebastiano Vigna",
title = "\pkg{LXM}: better splittable pseudorandom number
generators (and almost as fast)",
journal = j-PACMPL,
volume = "5",
number = "OOPSLA",
pages = "148:1--148:31",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3485525",
ISSN = "2475-1421 (electronic)",
ISSN-L = "2475-1421",
bibdate = "Wed Mar 2 07:00:43 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pacmpl.bib;
URL = "https://dl.acm.org/doi/10.1145/3485525",
abstract = "In 2014, Steele, Lea, and Flood presented SplitMix, an
object-oriented pseudorandom number generator (prng)
that is quite fast (9 64-bit arithmetic/logical
operations per 64 bits generated) and also splittable.
A conventional prng object provides a generate method
that returns one pseudorandom value and updates the
state of the prng; a splittable prng object also has a
second operation, split, that replaces the original
prng object with two (seemingly) independent prng
objects, by creating and returning a new such object
and updating the state of the original object.
Splittable prng objects make it easy to organize the
use of pseudorandom numbers in multithreaded programs
structured using fork-join parallelism. This overall
strategy still appears to be sound, but the specific
arithmetic calculation used for generate in the
SplitMix algorithm has some detectable weaknesses, and
the period of any one generator is limited to $ 2^{64} $.\par
Here we present the LXM family of prng algorithms. The
idea is an old one: combine the outputs of two
independent prng algorithms, then (optionally) feed the
result to a mixing function. An LXM algorithm uses a
linear congruential subgenerator and an $ F_2 $-linear
subgenerator; the examples studied in this paper use a
linear congruential generator (LCG) of period $ 2^{16} $, $ 2^{32} $,
$ 2^{64} $, or $ 2^{128} $ with one of the multipliers recommended by
L'Ecuyer or by Steele and Vigna, and an $ F_2 $-linear
xor-based generator (XBG) of the xoshiro family or
xoroshiro family as described by Blackman and Vigna.
For mixing functions we study the MurmurHash3 finalizer
function; variants by David Stafford, Doug Lea, and
degski; and the null (identity) mixing function.\par
Like SplitMix, LXM provides both a generate operation
and a split operation. Also like SplitMix, LXM requires
no locking or other synchronization (other than the
usual memory fence after instance initialization), and
is suitable for use with simd instruction sets because
it has no branches or loops.\par
We analyze the period and equidistribution properties
of LXM generators, and present the results of thorough
testing of specific members of this family, using the
TestU01 and PractRand test suites, not only on single
instances of the algorithm but also for collections of
instances, used in parallel, ranging in size from 2 to
$ 2^{24} $. Single instances of LXM that include a strong
mixing function appear to have no major weaknesses, and
LXM is significantly more robust than SplitMix against
accidental correlation in a multithreaded setting. We
believe that LXM, like SplitMix, is suitable for
``everyday'' scientific and machine-learning
applications (but not cryptographic applications),
especially when concurrent threads or distributed
processes are involved.",
acknowledgement = ack-nhfb,
articleno = "148",
fjournal = "Proceedings of the ACM on Programming Languages
journal-URL = "https://dl.acm.org/loi/pacmpl",
author = "Xulong Tang and Mahmut Taylan Kandemir and Mustafa
title = "Mix and Match: Reorganizing Tasks for Enhancing Data
journal = j-POMACS,
volume = "5",
number = "2",
pages = "20:1--20:24",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3460087",
ISSN = "2476-1249",
ISSN-L = "2476-1249",
bibdate = "Wed Mar 2 06:36:38 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pomacs.bib;
URL = "https://dl.acm.org/doi/10.1145/3460087",
abstract = "Application programs that exhibit strong locality of
reference lead to minimized cache misses and better
performance in different architectures. However, to
maximize the performance of multithreaded applications
running on emerging manycore systems, \ldots{}",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "Proceedings of the ACM on Measurement and Analysis of
Computing Systems (POMACS)",
journal-URL = "https://dl.acm.org/loi/pomacs",
author = "Xiaohan Tao and Jianmin Pang and Yu Zhu",
title = "Compiler-directed scratchpad memory data transfer
optimization for multithreaded applications on a
heterogeneous many-core architecture",
volume = "77",
number = "12",
pages = "14502--14524",
month = dec,
year = "2021",
DOI = "https://doi.org/10.1007/s11227-021-03853-x",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Mon Feb 28 16:44:31 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jsuper2020.bib;
URL = "https://link.springer.com/article/10.1007/s11227-021-03853-x",
acknowledgement = ack-nhfb,
ajournal = "J. Supercomputing",
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "M. A. Anju and Rupesh Nasre",
title = "Multi-Interval {DomLock}: Toward Improving Concurrency
in Hierarchies",
journal = j-TOPC,
volume = "9",
number = "3",
pages = "12:1--12:27",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3543543",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Sep 20 09:34:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/topc.bib;
URL = "https://dl.acm.org/doi/10.1145/3543543",
abstract = "Locking has been a predominant technique depended upon
for achieving thread synchronization and ensuring
correctness in multi-threaded applications. It has been
established that the concurrent applications working
with hierarchical data witness \ldots{}",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "https://dl.acm.org/loi/topc",
author = "Jianyi Cheng and Shane T. Fleming and Yu Ting Chen and
Jason Anderson and John Wickerson and George A.
title = "Efficient Memory Arbitration in High-Level Synthesis
From Multi-Threaded Code",
journal = j-IEEE-TRANS-COMPUT,
volume = "71",
number = "4",
pages = "933--946",
month = apr,
year = "2022",
DOI = "https://doi.org/10.1109/TC.2021.3066466",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Thu Mar 17 06:38:17 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Josu{\'e} Feliu and Ajeya Naithani and Julio
Sahuquillo and Salvador Petit and Moinuddin Qureshi and
Lieven Eeckhout",
title = "{VMT}: Virtualized Multi-Threading for Accelerating
Graph Workloads on Commodity Processors",
journal = j-IEEE-TRANS-COMPUT,
volume = "71",
number = "6",
pages = "1386--1398",
month = jun,
year = "2022",
DOI = "https://doi.org/10.1109/TC.2021.3086069",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Wed May 25 09:41:19 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Omar Inverso and Ermenegildo Tomasco and Bernd Fischer
and Salvatore {La Torre} and Gennaro Parlato",
title = "Bounded Verification of Multi-threaded Programs via
Lazy Sequentialization",
journal = j-TOPLAS,
volume = "44",
number = "1",
pages = "1:1--1:50",
month = mar,
year = "2022",
DOI = "https://doi.org/10.1145/3478536",
ISSN = "0164-0925 (print), 1558-4593 (electronic)",
ISSN-L = "0164-0925",
bibdate = "Fri Jan 14 06:53:13 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "https://dl.acm.org/doi/10.1145/3478536",
abstract = "Bounded verification techniques such as bounded model
checking (BMC) have successfully been used for many
practical program analysis problems, but concurrency
still poses a challenge. Here, we describe a new
approach to BMC of sequentially consistent \ldots{}",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Programming Languages and
journal-URL = "https://dl.acm.org/loi/toplas",
author = "Vasilios Kelefouras and Karim Djemame",
title = "Workflow simulation and multi-threading aware task
scheduling for heterogeneous computing",
journal = j-J-PAR-DIST-COMP,
volume = "168",
number = "??",
pages = "17--32",
month = oct,
year = "2022",
DOI = "https://doi.org/10.1016/j.jpdc.2022.05.011",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Sat Jul 16 10:35:47 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp2020.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0743731522001265",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Jonas H. M{\"u}ller Kornd{\"o}rfer and Ahmed Eleliemy
and Ali Mohammed and Florina M. Ciorba",
title = "{LB4OMP}: a Dynamic Load Balancing Library for
Multithreaded Applications",
volume = "33",
number = "4",
pages = "830--841",
month = apr,
year = "2022",
DOI = "https://doi.org/10.1109/TPDS.2021.3107775",
ISSN = "1045-9219 (print), 1558-2183 (electronic)",
ISSN-L = "1045-9219",
bibdate = "Thu Nov 11 08:39:34 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Parallel and Distributed
journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71",
author = "Marco Minutoli and Vito Giovanni Castellana and Nicola
Saporetti and Stefano Devecchi and Marco Lattuada and
Pietro Fezzardi and Antonino Tumeo and Fabrizio
title = "\pkg{Svelto}: High-Level Synthesis of Multi-Threaded
Accelerators for Graph Analytics",
journal = j-IEEE-TRANS-COMPUT,
volume = "71",
number = "3",
pages = "520--533",
month = mar,
year = "2022",
DOI = "https://doi.org/10.1109/TC.2021.3057860",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Thu Feb 17 08:09:56 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Mirko Myllykoski",
title = "{Algorithm 1019}: a Task-based Multi-shift {$ Q R $
\slash $ Q Z $} Algorithm with Aggressive Early
journal = j-TOMS,
volume = "48",
number = "1",
pages = "11:1--11:36",
month = mar,
year = "2022",
DOI = "https://doi.org/10.1145/3495005",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Thu Feb 17 08:00:57 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/toms.bib;
URL = "https://dl.acm.org/doi/10.1145/3495005",
abstract = "The $ Q R $ algorithm is one of the three phases in
the process of computing the eigenvalues and the
eigenvectors of a dense nonsymmetric matrix. This paper
describes a task-based $ Q R $ algorithm for reducing
an upper Hessenberg matrix to real Schur form. The
task-based algorithm also supports generalized
eigenvalue problems ($ Q Z $ algorithm) but this paper
concentrates on the standard case. The task-based
algorithm adopts previous algorithmic improvements,
such as tightly-coupled multi-shifts and Aggressive
Early Deflation (AED), and also incorporates several
new ideas that significantly improve the performance.
This includes, but is not limited to, the elimination
of several synchronization points, the dynamic merging
of previously separate computational steps, the
shortening and the prioritization of the critical path,
and experimental GPU support. The task-based
implementation is demonstrated to be multiple times
faster than multi-threaded LAPACK and ScaLAPACK in both
single-node and multi-node configurations on two
different machines based on Intel and AMD CPUs. The
implementation is built on top of the StarPU runtime
system and is part of the open-source StarNEig
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "https://dl.acm.org/loi/toms",
author = "Matthieu Perrin and Achour Most{\'e}faoui and Ludmila
title = "Extending the wait-free hierarchy to multi-threaded
journal = j-DISTRIB-COMPUT,
volume = "35",
number = "4",
pages = "375--398",
month = aug,
year = "2022",
DOI = "https://doi.org/10.1007/s00446-022-00425-x",
ISSN = "0178-2770 (print), 1432-0452 (electronic)",
ISSN-L = "0178-2770",
bibdate = "Mon Aug 1 08:49:35 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/distribcomput.bib;
URL = "https://link.springer.com/article/10.1007/s00446-022-00425-x",
acknowledgement = ack-nhfb,
ajournal = "Distrib. comput.",
fjournal = "Distributed Computing",
journal-URL = "https://link.springer.com/journal/446",
author = "Luc{\'{\i}}a Pons and Josu{\'e} Feliu and Jos{\'e}
Puche and Chaoyi Huang and Salvador Petit and Julio
Pons and Mar{\'{\i}}a E. G{\'o}mez and Julio
title = "Effect of Hyper-Threading in Latency-Critical
Multithreaded Cloud Applications and Utilization
Analysis of the Major System Resources",
journal = j-FUT-GEN-COMP-SYS,
volume = "131",
number = "??",
pages = "194--208",
month = jun,
year = "2022",
DOI = "https://doi.org/10.1016/j.future.2022.01.025",
ISSN = "0167-739X (print), 1872-7115 (electronic)",
ISSN-L = "0167-739X",
bibdate = "Wed Mar 9 17:27:32 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/futgencompsys2020.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0167739X22000334",
acknowledgement = ack-nhfb,
fjournal = "Future Generation Computer Systems",
journal-URL = "http://www.sciencedirect.com/science/journal/0167739X",
author = "Azalea Raad and Luc Maranget and Viktor Vafeiadis",
title = "Extending {Intel-x86} consistency and persistency:
formalising the semantics of {Intel-x86} memory types
and non-temporal stores",
journal = j-PACMPL,
volume = "6",
number = "POPL",
pages = "22:1--22:31",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3498683",
ISSN = "2475-1421 (electronic)",
ISSN-L = "2475-1421",
bibdate = "Thu May 26 06:32:48 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pacmpl.bib;
URL = "https://dl.acm.org/doi/10.1145/3498683",
abstract = "Existing semantic formalisations of the Intel-x86
architecture cover only a small fragment of its
available features that are relevant for the
consistency semantics of multi-threaded programs as
well as the persistency semantics of programs
interfacing \ldots{}",
acknowledgement = ack-nhfb,
articleno = "22",
fjournal = "Proceedings of the ACM on Programming Languages
journal-URL = "https://dl.acm.org/loi/pacmpl",
author = "Robert Ritchie and Khodakhast Bibak",
title = "\pkg{DOTMIX-Pro}: faster and more efficient variants
of {DOTMIX} for dynamic-multithreading platforms",
volume = "78",
number = "1",
pages = "945--961",
month = jan,
year = "2022",
DOI = "https://doi.org/10.1007/s11227-021-03904-3",
ISSN = "0920-8542 (print), 1573-0484 (electronic)",
ISSN-L = "0920-8542",
bibdate = "Mon Feb 28 16:44:33 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jsuper2020.bib;
URL = "https://link.springer.com/article/10.1007/s11227-021-03904-3",
acknowledgement = ack-nhfb,
ajournal = "J. Supercomputing",
fjournal = "The Journal of Supercomputing",
journal-URL = "http://link.springer.com/journal/11227",
author = "Alfonso Rodr{\'\i}guez and Andr{\'e}s Otero and Marco
Platzner and Eduardo de la Torre",
title = "Exploiting Hardware-Based Data-Parallel and
Multithreading Models for Smart Edge Computing in
Reconfigurable {FPGAs}",
journal = j-IEEE-TRANS-COMPUT,
volume = "71",
number = "11",
pages = "2903--2914",
month = nov,
year = "2022",
DOI = "https://doi.org/10.1109/TC.2021.3107196",
ISSN = "0018-9340 (print), 1557-9956 (electronic)",
ISSN-L = "0018-9340",
bibdate = "Thu Oct 27 15:52:25 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Computers",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12",
author = "Michail Schwab and David Saffo and Nicholas Bond and
Shash Sinha and Cody Dunne and Jeff Huang and James
Tompkin and Michelle A. Borkin",
title = "Scalable Scalable Vector Graphics: Automatic
Translation of Interactive {SVGs} to a Multithread
{VDOM} for Fast Rendering",
volume = "28",
number = "9",
pages = "3219--3234",
month = sep,
year = "2022",
DOI = "https://doi.org/10.1109/TVCG.2021.3059294",
ISSN = "1077-2626",
ISSN-L = "1077-2626",
bibdate = "Thu Aug 4 06:28:31 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetransviscomputgraph2020.bib;
acknowledgement = ack-nhfb,
fjournal = "IEEE Transactions on Visualization and Computer
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2945",
author = "James D. Trotter and Xing Cai and Simon W. Funke",
title = "On Memory Traffic and Optimisations for Low-order
Finite Element Assembly Algorithms on Multi-core
journal = j-TOMS,
volume = "48",
number = "2",
pages = "19:1--19:31",
month = jun,
year = "2022",
DOI = "https://doi.org/10.1145/3503925",
ISSN = "0098-3500 (print), 1557-7295 (electronic)",
ISSN-L = "0098-3500",
bibdate = "Wed Jul 20 07:04:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/toms.bib;
URL = "https://dl.acm.org/doi/10.1145/3503925",
abstract = "Motivated by the wish to understand the achievable
performance of finite element assembly on unstructured
computational meshes, we dissect the standard cellwise
assembly algorithm into four kernels, two of which are
dominated by irregular memory traffic. Several
optimisation schemes are studied together with
associated lower and upper bounds on the estimated
memory traffic volume. Apart from properly reordering
the mesh entities, the two most significant
optimisations include adopting a lookup table in adding
element matrices or vectors to their global
counterparts, and using a row-wise assembly algorithm
for multi-threaded parallelisation. Rigorous
benchmarking shows that, due to the various
optimisations, the actual volumes of memory traffic are
in many cases very close to the estimated lower bounds.
These results confirm the effectiveness of the
optimisations, while also providing a recipe for
developing efficient software for finite element
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Transactions on Mathematical Software (TOMS)",
journal-URL = "https://dl.acm.org/loi/toms",
author = "Zhe Wang and Chen Xu and Kunal Agrawal and Jing Li",
title = "Adaptive scheduling of multiprogrammed
dynamic-multithreading applications",
journal = j-J-PAR-DIST-COMP,
volume = "162",
number = "??",
pages = "76--88",
month = apr,
year = "2022",
DOI = "https://doi.org/10.1016/j.jpdc.2022.01.009",
ISSN = "0743-7315 (print), 1096-0848 (electronic)",
ISSN-L = "0743-7315",
bibdate = "Thu Feb 10 06:39:27 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib;
URL = "http://www.sciencedirect.com/science/article/pii/S0743731522000144",
acknowledgement = ack-nhfb,
fjournal = "Journal of Parallel and Distributed Computing",
journal-URL = "http://www.sciencedirect.com/science/journal/07437315",
author = "Changwei Zou and Xudong Wang and Yaoqing Gao and
Jingling Xue",
title = "Buddy Stacks: Protecting Return Addresses with
Efficient Thread-Local Storage and Runtime
journal = j-TOSEM,
volume = "31",
number = "2",
pages = "35e:1--35e:37",
month = apr,
year = "2022",
DOI = "https://doi.org/10.1145/3494516",
ISSN = "1049-331X (print), 1557-7392 (electronic)",
ISSN-L = "1049-331X",
bibdate = "Tue May 24 07:09:20 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tosem.bib;
URL = "https://dl.acm.org/doi/10.1145/3494516",
abstract = "Shadow stacks play an important role in protecting
return addresses to mitigate ROP attacks. Parallel
shadow stacks, which shadow the call stack of each
thread at the same constant offset for all threads, are
known not to support multi-threading well. On
acknowledgement = ack-nhfb,
articleno = "35e",
fjournal = "ACM Transactions on Software Engineering and
journal-URL = "https://dl.acm.org/loi/tosem",
editor = "{IEEE}",
booktitle = "Workstation Operating Systems: Proceedings of the
Second Workshop on Workstation Operating Systems
(WWOS-II), Pacific Grove, CA, USA, September 27--29,
title = "Workstation Operating Systems: Proceedings of the
Second Workshop on Workstation Operating Systems
({WWOS}-{II}), Pacific Grove, {CA}, {USA}, September
27--29, 1989",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "xi + 134",
year = "1989",
bibdate = "Sat Sep 28 20:21:01 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib;
note = "IEEE catalog number 89TH0281-6.",
acknowledgement = ack-nhfb,
classification = "B0100 (General electrical engineering topics);
B6210L (Computer communications); C5430
(Microcomputers); C5620 (Computer networks and
techniques); C5630 (Networking equipment); C6120 (File
organisation); C6150J (Operating systems); C6155
(Computer communications software)",
confsponsor = "IEEE",
keywords = "AIX3; At-most-once message; Coda file system; Echo
distributed file system; Fault-tolerant multiprocessor
workstations; File implementation; File-server
statelessness; Global communication interface; Guide
operating system; Large-scale applications; Mach;
Multimedia applications; Object-oriented environments;
Open operating system; Parallel algorithms; PLURIX;
PROST; Prototype information environment; Raven
project; Replicated servers; Shared memory; Sprite;
Synchronized clocks; Ubik database; Very large
distributed systems; Virtual memory; Virtual systems;
Workstation networks; Workstation-network communication
interface; X-kernel",
thesaurus = "Computer communications software; Computer networks;
File organisation; File servers; Operating systems
[computers]; Workstations",
editor = "{USENIX Association}",
booktitle = "Proceedings of the Winter 1989 {USENIX} Conference:
January 30--February 3, 1989, San Diego, California,
title = "Proceedings of the Winter 1989 {USENIX} Conference:
January 30--February 3, 1989, San Diego, California,
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "x + 471",
year = "1989",
bibdate = "Sun Feb 18 07:46:09 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "UNIX (Computer operating system) --- Congresses.",
editor = "Anonymous",
booktitle = "Proceedings of the Winter 1990 USENIX Conference,
Washington, DC, USA, January 22--26, 1990",
title = "Proceedings of the Winter 1990 {USENIX} Conference,
Washington, {DC}, {USA}, January 22--26, 1990",
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "xvi + 374",
year = "1990",
bibdate = "Sat Sep 28 20:03:34 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
editor = "{IEEE}",
booktitle = "Proceedings, Supercomputing '90: November 12--16,
1990, New York Hilton at Rockefeller Center, New York,
New York",
title = "Proceedings, Supercomputing '90: November 12--16,
1990, New York Hilton at Rockefeller Center, New York,
New York",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "xxv + 982",
year = "1990",
ISBN = "0-8186-2056-0 (paperback: IEEE Computer Society),
0-89791-412-0 (paperback: ACM)",
ISBN-13 = "978-0-8186-2056-0 (paperback: IEEE Computer Society),
978-0-89791-412-3 (paperback: ACM)",
LCCN = "QA 76.88 S87 1990",
bibdate = "Wed Aug 28 06:48:31 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
University of California MELVYL catalog.",
note = "ACM order number 415903. IEEE Computer Society Press
order number 2056. IEEE catalog number 90CH2916-5.",
acknowledgement = ack-nhfb,
classification = "C5440 (Multiprocessor systems and techniques); C5470
(Performance evaluation and testing); C6110 (Systems
analysis and programming); C7000 (Computer
keywords = "biological applications; computer applications;
computer chess; innovative architectures; linear
algebra algorithms; memory; networking computing;
parallel languages; parallel processing; particle
transport; partitioning; performance evaluation;
performance visualizations; pipeline processing;
program analysis; program restructuring; scheduling;
supercomputers --- congresses; vector algorithms",
editor = "Anonymous",
booktitle = "{Proceedings of the International Symposium on
Supercomputing: Fukuoka, Japan, November 6--8, 1991}",
title = "{Proceedings of the International Symposium on
Supercomputing: Fukuoka, Japan, November 6--8, 1991}",
publisher = "Kyushu University Press",
address = "Fukuoka, Japan",
pages = "iv + 261",
year = "1991",
ISBN = "4-87378-284-8",
ISBN-13 = "978-4-87378-284-3",
LCCN = "QA76.88.I1991",
bibdate = "Fri Aug 30 08:01:51 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Supercomputers --- Congresses",
editor = "{USENIX}",
booktitle = "Proceedings of the {USENIX} Mach Symposium: November
20--22, 1991, Monterey, California, USA",
title = "Proceedings of the {USENIX} Mach Symposium: November
20--22, 1991, Monterey, California, {USA}",
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "262",
year = "1991",
LCCN = "QAX 27",
bibdate = "Sun Feb 18 07:46:09 MST 1996",
bibsource = "ftp://ftp.uu.net/library/bibliography;
acknowledgement = ack-nhfb,
keywords = "Memory management (Computer science) --- Congresses;
Operating systems (Computers) --- Congresses; UNIX
(Computer file) --- Congresses",
editor = "{USENIX}",
key = "USENIX-WINTER'91",
booktitle = "Proceedings of the Winter 1991 {USENIX} Conference:
January 21--January 25, 1991, Dallas, {TX}, {USA}",
title = "Proceedings of the Winter 1991 {USENIX} Conference:
January 21--January 25, 1991, Dallas, {TX}, {USA}",
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "ix + 363",
year = "1991",
LCCN = "QA 76.76 O63 U84 1992",
bibdate = "Mon Jul 18 12:14:50 1994",
bibsource = "ftp://ftp.uu.net/library/bibliography;
acknowledgement = ack-nhfb,
keywords = "Computer networks --- Congresses; Operating systems
(Computers) --- Congresses; Programming (Electronic
computers) --- Congresses; UNIX (Computer file) ---
editor = "Stephen M. Watt",
booktitle = "ISSAC '91: proceedings of the 1991 International
Symposium on Symbolic and Algebraic Computation, July
15--17, 1991, Bonn, Germany",
title = "{ISSAC} '91: proceedings of the 1991 International
Symposium on Symbolic and Algebraic Computation, July
15--17, 1991, Bonn, Germany",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "xiii + 468",
year = "1991",
ISBN = "0-89791-437-6",
ISBN-13 = "978-0-89791-437-6",
LCCN = "QA 76.95 I59 1991",
bibdate = "Thu Sep 26 06:00:06 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/bibnet/authors/d/dirac-p-a-m.bib;
abstract = "The following topics were dealt with: algorithms for
symbolic mathematical computation; languages, systems
and packages; computational geometry, group theory and
number theory; automatic theorem proving and
programming; interface of symbolics, numerics and
graphics; applications in mathematics, science and
engineering; and symbolic and algebraic computation in
acknowledgement = ack-nhfb,
classification = "C1160 (Combinatorial mathematics); C4130
(Interpolation and function approximation); C4210
(Formal logic); C4240 (Programming and algorithm
theory); C7310 (Mathematics)",
confdate = "15--17 July 1991",
conflocation = "Bonn, Germany",
confsponsor = "ACM",
keywords = "algebra --- data processing --- congresses; Algebraic
computation; Algorithms; Automatic theorem proving;
Computational geometry; Education; Engineering;
Graphics; Group theory; Languages; Mathematics;
mathematics --- data processing --- congresses; Number
theory; Programming; Science; Symbolic mathematical
computation; Symbolics",
pubcountry = "USA",
thesaurus = "Computational complexity; Formal languages;
Interpolation; Number theory; Polynomials; Symbol
editor = "{ACM}",
booktitle = "Conference proceedings / 1992 International Conference
on Supercomputing, July 19--23, 1992, Washington, DC",
title = "Conference proceedings / 1992 International Conference
on Supercomputing, July 19--23, 1992, Washington,
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "x + 485",
year = "1992",
ISBN = "0-89791-485-6 (paperback), 0-89791-486-4",
ISBN-13 = "978-0-89791-485-7 (paperback), 978-0-89791-486-4",
LCCN = "QA 76.88 I57 1992",
bibdate = "Wed Aug 28 06:48:31 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
University of California MELVYL catalog.",
note = "Sponsored by ACM SIGARCH.",
acknowledgement = ack-nhfb,
keywords = "supercomputers --- congresses",
editor = "{IEEE Computer Society. Technical Committee on
Computer Architecture}",
booktitle = "Proceedings, Supercomputing '92: Minneapolis,
Minnesota, November 16--20, 1992",
title = "Proceedings, Supercomputing '92: Minneapolis,
Minnesota, November 16--20, 1992",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "xxiv + 848",
year = "1992",
ISBN = "0-8186-2632-1 (case), 0-8186-2630-5 (paper),
0-8186-2631-3 (microfiche), 0-89791-537-2 (ACM Library
ISBN-13 = "978-0-8186-2632-6 (case), 978-0-8186-2630-2 (paper),
978-0-8186-2631-9 (microfiche), 978-0-89791-537-3 (ACM
Library series)",
LCCN = "QA76.5 .S894 1992",
bibdate = "Wed Aug 28 06:48:31 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
University of California MELVYL catalog.",
note = "Cover title: Supercomputing '91. ACM order number
415922. IEEE Computer Society Press order number 2630.
IEEE catalog number 92CH3216-9.",
acknowledgement = ack-nhfb,
keywords = "artificial intelligence; biosciences; cache;
compiling; distributed computing; fluids; industrial
modeling; instruction-level optimization;
interconnections; massively parallel systems;
multiprocessing programs; multiprocessing systems;
numerical applications; parallel algorithms; parallel
programming; parallelizing transformations; particles;
performance evaluation; performance methodology;
register efficiency; scheduling; sparse matrix
algorithms; supercomputers --- congresses; symbolic
algorithms; waves",
editor = "{USENIX}",
booktitle = "Proceedings of the Summer 1992 {USENIX} Conference:
June 8--12, 1992, San Antonio, Texas, USA",
title = "Proceedings of the Summer 1992 {USENIX} Conference:
June 8--12, 1992, San Antonio, Texas, {USA}",
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "vii + 253",
month = "Summer",
year = "1992",
ISBN = "1-880446-44-8",
ISBN-13 = "978-1-880446-44-7",
LCCN = "QA 76.76 O63 U83 1992",
bibdate = "Wed Aug 13 10:48:45 MDT 1997",
bibsource = "ftp://ftp.uu.net/library/bibliography;
acknowledgement = ack-nhfb,
annote = "Spine title: San Antonio conference proceedings.",
keywords = "UNIX (Computer operating system) --- Congresses",
location = "San Antonio, TX",
editor = "{USENIX}",
booktitle = "Symposium on Experiences with Distributed and
Multiprocessor Systems (SEDMS III), March 26--27, 1992.
Newport Beach, CA",
title = "Symposium on Experiences with Distributed and
Multiprocessor Systems ({SEDMS III}), March 26--27,
1992. Newport Beach, {CA}",
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "326",
day = "26--27",
month = mar,
year = "1992",
ISBN = "1-880446-41-3",
ISBN-13 = "978-1-880446-41-6",
LCCN = "QA76.9.D3 S954 1992",
bibdate = "Wed Oct 16 13:53:39 2002",
bibsource = "ftp://ftp.uu.net/library/bibliography;
acknowledgement = ack-nhfb,
location = "Newport Beach, CA",
editor = "{ACM}",
key = "ACM SIGPLAN POPL '93",
booktitle = "Conference record of the Twentieth Annual {ACM}
{SIGPLAN-SIGACT} Symposium on Principles of Programming
Languages: papers presented at the symposium,
{Charleston, South Carolina}, {January} 10--13, 1993",
title = "Conference record of the Twentieth Annual {ACM}
{SIGPLAN-SIGACT} Symposium on Principles of Programming
Languages: papers presented at the symposium,
{Charleston, South Carolina}, {January} 10--13, 1993",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "viii + 510",
year = "1993",
ISBN = "0-89791-560-7 (soft cover), 0-89791-561-5 (series hard
ISBN-13 = "978-0-89791-560-1 (soft cover), 978-0-89791-561-8
(series hard cover)",
LCCN = "QA76.7 .A15 1993",
bibdate = "Mon May 03 18:38:48 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "ACM order number 549930.",
URL = "http://www.acm.org/pubs/contents/proceedings/plan/158511/index.html",
acknowledgement = ack-nhfb,
classification = "C4210 (Formal logic); C4240 (Programming and
algorithm theory); C6110 (Systems analysis and
programming); C6140D (High level languages); C6150C
(Compilers, interpreters and other processors); C6170
(Expert systems)",
confdate = "10--13 Jan. 1993",
conflocation = "Charleston, SC, USA",
confsponsor = "ACM",
keywords = "Compilers; Computational complexity; electronic
digital computers --- programming --- congresses;
Functional programming; Lambda calculus; Lazy
evaluation; Logic programming; Object-oriented
languages; Parallel computing; Parametricity;
Polymorphism; Program testing/debugging; Programming
language principles; programming languages (electronic
computers) --- congresses; Register allocation; Typed
thesaurus = "Computational complexity; High level languages; Lambda
calculus; Program compilers; Programming; Programming
theory; Storage allocation",
editor = "{ACM}",
booktitle = "{Proceedings of the twenty-fifth annual ACM Symposium
on the Theory of Computing, San Diego, California, May
16--18, 1993}",
title = "{Proceedings of the twenty-fifth annual ACM Symposium
on the Theory of Computing, San Diego, California, May
16--18, 1993}",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "ix + 812",
year = "1993",
ISBN = "0-89791-591-7",
ISBN-13 = "978-0-89791-591-5",
LCCN = "QA 76.6 A13 1993",
bibdate = "Thu Dec 3 07:11:18 MST 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "ACM order no. 508930.",
acknowledgement = ack-nhfb,
keywords = "computational complexity --- congresses",
editor = "ACM",
booktitle = "TRI-Ada '93: Conference --- September 1993, Seattle,
title = "{TRI}-Ada '93: Conference --- September 1993, Seattle,
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "vii + 482",
year = "1993",
ISBN = "0-89791-621-2",
ISBN-13 = "978-0-89791-621-9",
LCCN = "????",
bibdate = "Thu Sep 04 12:56:10 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "ACM Order No. 825930.",
series = "TRIADA -proceedings- 1993",
acknowledgement = ack-nhfb,
sponsor = "Association for Computing Machinery; SIGAda.",
editor = "{IEEE}",
key = "Supercomputing'93",
booktitle = "Proceedings, Supercomputing '93: Portland, Oregon,
November 15--19, 1993",
title = "Proceedings, Supercomputing '93: Portland, Oregon,
November 15--19, 1993",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "xxii + 935",
year = "1993",
ISBN = "0-8186-4340-4 (paperback), 0-8186-4341-2 (microfiche),
0-8186-4342-0 (hardback), 0-8186-4346-3 (CD-ROM)",
ISBN-13 = "978-0-8186-4340-8 (paperback), 978-0-8186-4341-5
(microfiche), 978-0-8186-4342-2 (hardback),
978-0-8186-4346-0 (CD-ROM)",
ISSN = "1063-9535",
LCCN = "QA76.5 .S96 1993",
bibdate = "Mon Jan 15 11:06:21 1996",
bibsource = "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
acknowledgement = ack-nhfb,
classification = "631.1; 722.1; 722.3; 722.4; 723.2; 921.6",
keywords = "Algorithms; Cache coherence; Clustered workstations;
Computer graphics; Computer networks; Computer
programming languages; Data parallel compilers; Data
partitioning; Distributed computer systems; Eigenvalues
and eigenfunctions; Finite element method; Flow
visualization; Fluid mechanics; Linear algebra; Mass
storage; Massively parallel processors; Natural
sciences computing; Parallel languages; Parallel
processing systems; Parallel rendering; Program
compilers; Quantum theory; Scheduling; Sparse matrices;
sponsor = "Institute of Electrical and Electronics Engineers;
Computer Society. Association for Computing Machinery;
editor = "{USENIX}",
booktitle = "Proceedings of the {USENIX} Mobile and
Location-Independent Computing Symposium: August 2--3,
1993, Cambridge, Massachusetts, USA",
title = "Proceedings of the {USENIX} Mobile and
Location-Independent Computing Symposium: August 2--3,
1993, Cambridge, Massachusetts, {USA}",
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "138",
year = "1993",
ISBN = "1-880446-51-0",
ISBN-13 = "978-1-880446-51-5",
LCCN = "QA 76.76 O63 U86 1993",
bibdate = "Tue Oct 22 08:33:21 2002",
bibsource = "ftp://ftp.uu.net/library/bibliography;
URL = "http://www.usenix.org/publications/library/proceedings/mobile93/",
acknowledgement = ack-nhfb,
annote = "Spine title: Mobile and Location-Independent Computing
Symposium, Summer 1993.",
keywords = "Computer networks --- Congresses; Portable computers
--- Communication systems --- Congresses; UNIX
(Computer file) --- Congresses",
editor = "{USENIX}",
booktitle = "Proceedings of the Winter 1993 {USENIX} Conference:
January 25--29, 1993, San Diego, California, {USA}",
title = "Proceedings of the Winter 1993 {USENIX} Conference:
January 25--29, 1993, San Diego, California, {USA}",
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "x + 530",
year = "1993",
ISBN = "1-880446-48-0",
ISBN-13 = "978-1-880446-48-5",
LCCN = "QA 76.76 O63 U84 1993",
bibdate = "Sun Feb 18 07:46:09 MST 1996",
bibsource = "ftp://ftp.uu.net/library/bibliography;
URL = "http://www.usenix.org/publications/library/proceedings/sd93/",
acknowledgement = ack-nhfb,
annote = "Spine title: USENIX San Diego conference proceedings,
winter 1993. Running title: 1993 winter USENIX, January
25--29, 1993, San Diego, CA.",
keywords = "Computer networks --- Congresses; Operating systems
(Computers) --- Congresses; Programming (Electronic
computers) --- Congresses; UNIX (Computer file) ---
editor = "{ACM}",
booktitle = "{ACM SIGPLAN '94 Conference on Programming Language
Design and Implementation (PLDI). Orlando, FL, USA,
20--24 June, 1994}",
title = "{ACM SIGPLAN '94 Conference on Programming Language
Design and Implementation (PLDI). Orlando, FL, USA,
20--24 June, 1994}",
volume = "29(6)",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "360",
month = jun,
year = "1994",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Fri Apr 24 18:36:02 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = j-SIGPLAN,
acknowledgement = ack-nhfb,
classification = "C4240 (Programming and algorithm theory); C6110
(Systems analysis and programming); C6140D (High level
languages); C6150C (Compilers, interpreters and other
processors); C6150G (Diagnostic, testing, debugging and
evaluating systems)",
conftitle = "ACM SIGPLAN '94 Conference on Programming Language
Design and Implementation (PLDI)",
keywords = "address calculation; array access errors;
backtracking; cache performance; CLP; code replication;
compilation techniques; continuation passing; garbage
collected programs; high level languages; jump
debugging; jump statements; lazy functional state
threads; link-time optimisation; memory access
coalescing; optimal tracing; optimisation; partial dead
code elimination; pointer-based data structures;
Presburger Formulas; program analysis tools; program
compilers; program debugging; program optimisation;
program structure tree; programming; programming
language design; programming theory; programming theory
program debugging; Prolog; register allocation; slicing
programs; Standard ML; type analysis; zero-cost range
sponsororg = "ACM",
treatment = "P Practical; T Theoretical or Mathematical",
editor = "{ACM}",
booktitle = "Conference record of {POPL} '94, 21st {ACM
SIGPLAN-SIGACT} Symposium on Principles of Programming
Languages: papers presented at the Symposium: Portland,
Oregon, January 17--21, 1994",
title = "Conference record of {POPL} '94, 21st {ACM
SIGPLAN-SIGACT} Symposium on Principles of Programming
Languages: papers presented at the Symposium: Portland,
Oregon, January 17--21, 1994",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "viii + 492",
year = "1994",
ISBN = "0-89791-636-0",
ISBN-13 = "978-0-89791-636-3",
LCCN = "QA76.7 .A15 1994",
bibdate = "Sat Sep 7 07:51:54 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.acm.org/pubs/contents/proceedings/plan/174675/index.html",
abstract = "The following topics were dealt with: programming
language principles; OOP; type theory; program
correctness; lambda calculus; garbage collection; logic
programming; scheduling; data flow graphs; functional
programming; and continuation passing.",
acknowledgement = ack-nhfb,
classification = "C4210 (Formal logic); C4240 (Programming and
algorithm theory); C6110J (Object-oriented
programming); C6120 (File organisation); C6140D (High
level languages); C6150C (Compilers, interpreters and
other processors)",
confdate = "17--21 Jan. 1994",
conflocation = "Portland, OR, USA",
confsponsor = "ACM",
keywords = "Continuation passing; Data flow graphs; Functional
programming; Garbage collection; Lambda calculus; Logic
programming; OOP; Program correctness; Programming
language principles; Scheduling; Type theory",
thesaurus = "High level languages; Lambda calculus; Object-oriented
programming; Program compilers; Program verification;
Storage management; Type theory",
editor = "{ACM}",
booktitle = "{ISSAC '94: Proceedings of the 1994 International
Symposium on Symbolic and Algebraic Computation: July
20--22, 1994, Oxford, England, United Kingdom}",
title = "{ISSAC '94: Proceedings of the 1994 International
Symposium on Symbolic and Algebraic Computation: July
20--22, 1994, Oxford, England, United Kingdom}",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "ix + 359",
year = "1994",
ISBN = "0-89791-638-7",
ISBN-13 = "978-0-89791-638-7",
LCCN = "QA76.95.I59 1994",
bibdate = "Thu Sep 26 05:45:15 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
confdate = "20--22 July 1994",
conflocation = "Oxford, UK",
confsponsor = "ACM",
pubcountry = "USA",
editor = "ACM",
booktitle = "{Sixth International Conference on Architectural
Support for Programming Languages and Operating Systems
(ASPLOS-VI). San Jose, CA, USA, 4--7 October, 1994}",
title = "{Sixth International Conference on Architectural
Support for Programming Languages and Operating Systems
(ASPLOS-VI). San Jose, CA, USA, 4--7 October, 1994}",
volume = "29(11)",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "328",
month = nov,
year = "1994",
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
ISSN-L = "0362-1340",
bibdate = "Fri Apr 24 18:36:02 MDT 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = j-SIGPLAN,
acknowledgement = ack-nhfb,
classification = "C5220 (Computer architecture); C6140 (Programming
languages); C6150J (Operating systems)",
conflocation = "San Jose, CA, USA",
conftitle = "Sixth International Conference on Architectural
Support for Programming Languages and Operating Systems
keywords = "architectural support; code transformation; computer
architecture; instrumentation; measurement; memory
access; multithreading; operating systems; operating
systems (computers); parallel machines; programming
languages; shares memory multiprocessors; uniprocessor
sponsororg = "ACM; IEEE Comput. Soc",
editor = "Anonymous",
booktitle = "1994 International Computer Symposium Conference
title = "1994 International Computer Symposium Conference
publisher = "Nat. Chiao Tung Univ",
address = "Hsinchu, Taiwan",
pages = "xvi + 1310",
year = "1994",
ISBN = "????",
ISBN-13 = "????",
LCCN = "????",
bibdate = "Sun Dec 22 10:19:23 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "2 vol.",
acknowledgement = ack-nhfb,
confdate = "12--15 Dec. 1994",
conflocation = "Hsinchu, Taiwan",
confsponsor = "Ministr. Educ.; Comput. Soc",
pubcountry = "Taiwan",
editor = "Anonymous",
booktitle = "Proceedings of the 2nd International World Wide Web
conference, Mosaic and the Web, October 1994,
Ramada-Congress Hotel, 520 South Michigan Avenue,
Chicago, IL",
title = "Proceedings of the 2nd International World Wide Web
conference, Mosaic and the Web, October 1994,
Ramada-Congress Hotel, 520 South Michigan Avenue,
Chicago, {IL}",
volume = "18(6)",
publisher = pub-LEARNED-INF,
address = pub-LEARNED-INF:adr,
pages = "????",
year = "1994",
ISSN = "0309-314X",
bibdate = "Sun Oct 22 08:43:14 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = j-ONLINE-CDROM-REV,
URL = "http://www.ncsa.uiuc.edu/SDG/IT94/Proceedings/WWW2_Proceedings.html",
acknowledgement = ack-nhfb,
editor = "Anonymous",
booktitle = "USENIX Summer conference: --- June 1994, Boston, MA",
title = "{USENIX} Summer conference: --- June 1994, Boston,
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "316",
year = "1994",
ISBN = "1-880446-62-6",
ISBN-13 = "978-1-880446-62-1",
LCCN = "QA 76.76 O63 U83 1994",
bibdate = "Sat May 25 07:59:58 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "USENIX Conference Proceedings 1994",
acknowledgement = ack-nhfb,
editor = "Shafi Goldwasser",
booktitle = "Proceedings: 35th Annual Symposium on Foundations of
Computer Science, November 20--22, 1994, Santa Fe, New
title = "Proceedings: 35th Annual Symposium on Foundations of
Computer Science, November 20--22, 1994, Santa Fe, New
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "xiii + 837",
year = "1994",
ISBN = "0-8186-6582-3",
ISBN-13 = "978-0-8186-6582-0",
ISSN = "0272-5428",
LCCN = "QA 76 S979 1994",
bibdate = "Thu Dec 3 07:11:18 MST 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "IEEE catalog number 94CH35717. IEEE Computer Society
Press Order Number 6580-02.",
acknowledgement = ack-nhfb,
keywords = "electronic data processing --- congresses",
editor = "Hoon Hong",
booktitle = "{First International Symposium on Parallel Symbolic
Computation, PASCO '94, Hagenberg\slash Linz, Austria,
September 26--28, 1994}",
title = "{First International Symposium on Parallel Symbolic
Computation, PASCO '94, Hagenberg\slash Linz, Austria,
September 26--28, 1994}",
volume = "5",
publisher = pub-WORLD-SCI,
address = pub-WORLD-SCI:adr,
pages = "xiii + 431",
year = "1994",
ISBN = "981-02-2040-5",
ISBN-13 = "978-981-02-2040-2",
LCCN = "QA76.642.I58 1994",
bibdate = "Thu Mar 12 07:55:38 MST 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/issac.bib;
series = "Lecture notes series in computing",
acknowledgement = ack-nhfb,
alttitle = "Parallel symbolic computation",
keywords = "Parallel programming (Computer science) ---
editor = "{IEEE}",
booktitle = "Proceedings 11th IEEE Workshop on Real-Time Operating
Systems and Software. RTOSS '94, Seattle, WA, USA,
18--19 May 1994",
title = "Proceedings 11th {IEEE} Workshop on Real-Time
Operating Systems and Software. {RTOSS} '94, Seattle,
{WA}, {USA}, 18--19 May 1994",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "viii + 117",
year = "1994",
ISBN = "0-8186-5710-3",
ISBN-13 = "978-0-8186-5710-8",
LCCN = "QA76.54.I173 1994",
bibdate = "Sat Sep 28 18:52:45 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib;
note = "IEEE catalog number 94TH0639-5.",
acknowledgement = ack-nhfb,
confsponsor = "IEEE",
editor = "{IEEE}",
booktitle = "{Proceedings of the Scalable High-Performance
Computing Conference, May 23--25, 1994, Knoxville,
title = "{Proceedings of the Scalable High-Performance
Computing Conference, May 23--25, 1994, Knoxville,
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "xviii + 852",
year = "1994",
ISBN = "0-8186-5680-8, 0-8186-5681-6",
ISBN-13 = "978-0-8186-5680-4, 978-0-8186-5681-1",
LCCN = "QA76.5 .S244 1994",
bibdate = "Mon Aug 26 10:38:41 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "IEEE catalog number 94TH0637-9.",
acknowledgement = ack-nhfb,
sponsor = "IEEE Computer Society; Technical Committee on
Supercomputing Applications.",
editor = "{IEEE}",
booktitle = "{Proceedings, Supercomputing '94: Washington, DC,
November 14--18, 1994}",
title = "{Proceedings, Supercomputing '94: Washington, DC,
November 14--18, 1994}",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "xvii + 823",
year = "1994",
ISBN = "0-8186-6605-6 (paper), 0-8186-6606-4 (microfiche),
0-8186-6607-2 (case)",
ISBN-13 = "978-0-8186-6605-6 (paper), 978-0-8186-6606-3
(microfiche), 978-0-8186-6607-0 (case)",
ISSN = "1063-9535",
LCCN = "QA76.5 .S894 1994",
bibdate = "Fri Aug 30 08:01:51 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "IEEE catalog number 94CH34819.",
series = "Supercomputing",
acknowledgement = ack-nhfb,
keywords = "Supercomputers --- Congresses",
sponsor = "IEEE.",
editor = "IEEE",
booktitle = "Real-time operating systems and software: RTOSS '94:
11th Workshop --- May 1994, Seattle, WA",
title = "Real-time operating systems and software: {RTOSS} '94:
11th Workshop --- May 1994, Seattle, {WA}",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "viii + 117",
year = "1994",
ISBN = "0-8186-5710-3",
ISBN-13 = "978-0-8186-5710-8",
LCCN = "QA76.54.I173 1994",
bibdate = "Sat May 25 07:59:58 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "IEEE Workshop on Real Time Operating Systems and
Software 1994; 11th",
acknowledgement = ack-nhfb,
sponsor = "IEEE; Computer Society; Technical Committee on
Real-Time Systems.",
editor = "ACM",
booktitle = "Conference proceedings of the 1995 International
Conference on Supercomputing, Barcelona, Spain, July
3--7, 1995",
title = "Conference proceedings of the 1995 International
Conference on Supercomputing, Barcelona, Spain, July
3--7, 1995",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "xii + 448",
year = "1995",
ISBN = "0-89791-728-6",
ISBN-13 = "978-0-89791-728-5",
LCCN = "QA 76.88 I57 1995",
bibdate = "Mon Dec 23 18:50:57 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = "Conference Proceedings of the International Conference
on Supercomputing",
acknowledgement = ack-nhfb,
sponsor = "Association for Computing Machinery. Special Interest
Group on Computer Architecture.",
editor = "{ACM}",
booktitle = "Conference record of {POPL} '95, 22nd {ACM}
{SIGPLAN-SIGACT} Symposium on Principles of Programming
Languages: papers presented at the Symposium: San
Francisco, California, January 22--25, 1995",
title = "Conference record of {POPL} '95, 22nd {ACM}
{SIGPLAN-SIGACT} Symposium on Principles of Programming
Languages: papers presented at the Symposium: San
Francisco, California, January 22--25, 1995",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "vii + 408",
year = "1995",
ISBN = "0-89791-692-1",
ISBN-13 = "978-0-89791-692-9",
LCCN = "QA 76.7 A11 1995",
bibdate = "Mon May 3 17:47:49 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "ACM order number: 549950.",
URL = "http://www.acm.org/pubs/contents/proceedings/plan/199448/index.html",
acknowledgement = ack-nhfb,
alttitle = "Proceedings, 22nd ACM SIGPLAN-SIGACT Symposium on
Principles of Programming Languages POPL '95",
annote = "Sponsored by the Association for Computing Machinery,
Special Interest Group on Algorithms and Computation
Theory (SIGACT), Special Interest Group on Programming
Languages (SIGPLAN).",
keywords = "Programming languages (Electronic computers) --
editor = "Afonso Ferreira and Jose Rolim",
booktitle = "{Parallel algorithms for irregularly structured
problems: second international workshop, IRREGULAR 95,
Lyon, France, September, 4--6, 1995: proceedings}",
title = "{Parallel algorithms for irregularly structured
problems: second international workshop, IRREGULAR 95,
Lyon, France, September, 4--6, 1995: proceedings}",
publisher = pub-SV,
address = pub-SV:adr,
pages = "x + 409",
year = "1995",
ISBN = "3-540-60321-2",
ISBN-13 = "978-3-540-60321-4",
LCCN = "QA76.642.I59 1995",
bibdate = "Sun Dec 22 10:19:23 MST 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
confsponsor = "IFIP",
pubcountry = "Germany",
editor = "{IEEE Computer Society. Technical Committee on
Computer Communications}",
booktitle = "Proceedings: 20th Conference on Local Computer
Networks, October 16--19, 1995, Minneapolis,
title = "Proceedings: 20th Conference on Local Computer
Networks, October 16--19, 1995, Minneapolis,
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "xii + 496",
year = "1995",
ISBN = "0-8186-7163-7 (microfiche), 0-8186-7162-9",
ISBN-13 = "978-0-8186-7163-0 (microfiche), 978-0-8186-7162-3",
LCCN = "TK5105.7 .C66 1995 Bar",
bibdate = "Mon Sep 27 06:55:07 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "IEEE Computer Society Press order number PR07162. IEEE
catalog number 95TB100005",
acknowledgement = ack-nhfb,
keywords = "local area networks (computer networks) --
editor = "{ACM}",
booktitle = "FCRC '96: Conference proceedings of the 1996
International Conference on Supercomputing:
Philadelphia, Pennsylvania, {USA}, May 25--28, 1996",
title = "{FCRC} '96: Conference proceedings of the 1996
International Conference on Supercomputing:
Philadelphia, Pennsylvania, {USA}, May 25--28, 1996",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "xii + 406",
year = "1996",
ISBN = "0-89791-803-7",
ISBN-13 = "978-0-89791-803-9",
LCCN = "QA76.5 I61 1996",
bibdate = "Wed Mar 18 12:33:29 MST 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "ACM order number 415961.",
acknowledgement = ack-nhfb,
keywords = "Supercomputers --- Congresses.",
editor = "{IEEE}",
booktitle = "Proceedings. Second MPI Developer's Conference: Notre
Dame, IN, USA, 1--2 July 1996",
title = "Proceedings. Second {MPI} Developer's Conference:
Notre Dame, {IN}, {USA}, 1--2 July 1996",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "ix + 207",
year = "1996",
ISBN = "0-8186-7533-0",
ISBN-13 = "978-0-8186-7533-1",
LCCN = "QA76.642 .M67 1996",
bibdate = "Tue May 12 08:56:04 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
sponsororg = "IEEE Comput. Soc. Tech. Committee on Distributed
editor = "{Lakshman Y. N.}",
booktitle = "{ISSAC '96: Proceedings of the 1996 International
Symposium on Symbolic and Algebraic Computation, July
24--26, 1996, Zurich, Switzerland}",
title = "{ISSAC '96: Proceedings of the 1996 International
Symposium on Symbolic and Algebraic Computation, July
24--26, 1996, Zurich, Switzerland}",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "xvii + 313",
year = "1996",
ISBN = "0-89791-796-0",
ISBN-13 = "978-0-89791-796-4",
LCCN = "QA 76.95 I59 1996",
bibdate = "Thu Mar 12 08:00:14 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/issac.bib;
acknowledgement = ack-nhfb,
sponsor = "ACM; Special Interest Group in Symbolic and Algebraic
Manipulation (SIGSAM). ACM; Special Interest Group on
Numerical Mathematics (SIGNUM).",
editor = "Boleslaw K. Szymanski and Balaram Sinharoy",
booktitle = "Languages, Compilers and Run-Time Systems for Scalable
Computers, Troy, NY, USA, May 22--24, 1995",
title = "Languages, Compilers and Run-Time Systems for Scalable
Computers, Troy, {NY}, {USA}, May 22--24, 1995",
publisher = pub-KLUWER,
address = pub-KLUWER:adr,
pages = "xiv + 335",
year = "1996",
ISBN = "0-7923-9635-9",
ISBN-13 = "978-0-7923-9635-2",
LCCN = "QA76.58.L37 1996",
bibdate = "Sat Sep 28 18:12:58 MDT 1996",
bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib;
acknowledgement = ack-nhfb,
editor = "{USENIX Association}",
booktitle = "4th Annual Tcl/Tk Workshop '96, July 10--13, 1996.
Monterey, CA",
title = "4th Annual Tcl/Tk Workshop '96, July 10--13, 1996.
Monterey, {CA}",
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "????",
day = "10--13",
month = jul,
year = "1996",
ISBN = "1-880446-78-2",
ISBN-13 = "978-1-880446-78-2",
LCCN = "QA76.73.T44 T44 1996",
bibdate = "Fri Oct 18 07:24:24 MDT 1996",
bibsource = "ftp://ftp.uu.net/library/bibliography;
acknowledgement = ack-nhfb,
location = "Monterey, CA",
editor = "{USENIX}",
booktitle = "Proceedings of the fourth annual Tcl\slash Tk
Workshop, July 10--13, 1996, Monterey, California",
title = "Proceedings of the fourth annual Tcl\slash Tk
Workshop, July 10--13, 1996, Monterey, California",
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "235",
year = "1996",
ISBN = "1-880446-78-2",
ISBN-13 = "978-1-880446-78-2",
LCCN = "QA 76.73 T44 T35 1996",
bibdate = "Mon May 11 11:50:25 1998",
bibsource = "ftp://ftp.uu.net/library/bibliography;
URL = "http://www.usenix.org/publications/library/proceedings/tcl96/",
acknowledgement = ack-nhfb,
location = "Monterey, CA",
editor = "{IEEE}",
booktitle = "Advances in parallel and distributed computing: March
19--21, 1997, Shanghai, China: proceedings",
title = "Advances in parallel and distributed computing: March
19--21, 1997, Shanghai, China: proceedings",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "xii + 426",
year = "1997",
ISBN = "0-8186-7876-3 (paperback and case), 0-8186-7878-X
ISBN-13 = "978-0-8186-7876-9 (paperback and case),
978-0-8186-7878-3 (microfiche)",
LCCN = "QA76.58 .A4 1997",
bibdate = "Wed Apr 16 07:34:31 MDT 1997",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "electronic data processing -- distributed processing
-- congresses; parallel processing (electronic
computers) -- congresses",
editor = "{ACM}",
booktitle = "ACM 1998 Workshop on Java for High-Performance Network
title = "{ACM} 1998 Workshop on Java for High-Performance
Network Computing",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "????",
year = "1998",
ISBN = "????",
ISBN-13 = "????",
LCCN = "????",
bibdate = "Thu Apr 27 10:40:59 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "Possibly unpublished, except electronically.",
URL = "http://www.cs.ucsb.edu/conferences/java98/program.html",
acknowledgement = ack-nhfb,
editor = "ACM",
booktitle = "Conference record of POPL '98: the 25th ACM
SIGPLAN-SIGACT Symposium on Principles of Programming
Languages: papers presented at the Symposium, San
Diego, California, 19--21 January 1998",
title = "Conference record of {POPL} '98: the 25th {ACM}
{SIGPLAN-SIGACT} Symposium on Principles of Programming
Languages: papers presented at the Symposium, San
Diego, California, 19--21 January 1998",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "viii + 408",
year = "1998",
ISBN = "0-89791-979-3",
ISBN-13 = "978-0-89791-979-1",
LCCN = "QA76.7 .A15 1998",
bibdate = "Mon May 3 17:47:49 MDT 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
note = "ACM order number: 549981.",
URL = "http://www.acm.org/pubs/contents/proceedings/plan/268946/index.html",
acknowledgement = ack-nhfb,
alttitle = "POPL '98 ACM SIGPLAN-SIGACT Symposium on Principles of
Programming Languages Principles of programming
languages Proceedings 25th ACM SIGPLAN-SIGACT Symposium
on Principles of Programming Languages",
keywords = "Electronic digital computers -- Programming --
Congresses.; Programming languages (Electronic
computers) -- Congresses.",
editor = "{ACM}",
booktitle = "{Proceedings: the 25th Annual International Symposium
on Computer Architecture, June 27--July 1, 1998,
Barcelona, Spain}",
title = "{Proceedings: the 25th Annual International Symposium
on Computer Architecture, June 27--July 1, 1998,
Barcelona, Spain}",
volume = "26(3)",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "xiii + 394",
year = "1998",
ISBN = "0-8186-8491-7, 0-8186-8492-5, 0-8186-8493-3",
ISBN-13 = "978-0-8186-8491-3, 978-0-8186-8492-0,
LCCN = "QA76.9.A73 S97 1998",
bibdate = "Fri May 12 12:36:10 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "ACM Order Number 414984. IEEE Computer Society Order
Number PR08491; IEEE Order Plan Catalog Number
series = "Computer architecture news",
URL = "http://portal.acm.org/toc.cfm?id=279358;
acknowledgement = ack-nhfb,
remark = "ISCA '25 proceedings.",
editor = "{ACM}",
booktitle = "SC'98: High Performance Networking and Computing:
Proceedings of the 1998 ACM\slash IEEE SC98 Conference:
Orange County Convention Center, Orlando, Florida, USA,
November 7--13, 1998",
title = "{SC}'98: High Performance Networking and Computing:
Proceedings of the 1998 {ACM}\slash {IEEE} {SC98}
Conference: Orange County Convention Center, Orlando,
Florida, {USA}, November 7--13, 1998",
publisher = pub-ACM # " and " # pub-IEEE,
address = pub-ACM:adr # " and " # pub-IEEE:adr,
pages = "????",
year = "1998",
ISBN = "????",
ISBN-13 = "????",
LCCN = "????",
bibdate = "Wed Oct 07 08:51:34 1998",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.supercomp.org/sc98/papers/",
acknowledgement = ack-nhfb,
editor = "{USENIX}",
booktitle = "Proceedings of the sixth annual Tcl/Tk Conference,
September 18--24 [i.e. 14--18], 1998, San Diego,
title = "Proceedings of the sixth annual Tcl/Tk Conference,
September 18--24 [i.e. 14--18], 1998, San Diego,
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "206",
year = "1998",
ISBN = "1-880446-98-7",
ISBN-13 = "978-1-880446-98-0",
LCCN = "QA76.73.T44 T34 1998",
bibdate = "Fri Oct 18 08:12:11 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://db.usenix.org/publications/library/proceedings/tcl98/",
acknowledgement = ack-nhfb,
editor = "{USENIX}",
booktitle = "Proceedings of the 2nd {USENIX Windows NT} Symposium:
August 3--5, 1998, Seattle, Washington",
title = "Proceedings of the 2nd {USENIX Windows NT} Symposium:
August 3--5, 1998, Seattle, Washington",
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "173",
year = "1998",
ISBN = "1-880446-95-2",
ISBN-13 = "978-1-880446-95-9",
LCCN = "QA76.76.O63 U885 1998",
bibdate = "Fri Oct 29 08:40:21 1999",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://db.usenix.org/publications/library/proceedings/usenix-nt98",
acknowledgement = ack-nhfb,
editor = "ACM",
booktitle = "Proceedings of the ACM SIGPLAN '99 Conference on
Programming Language Design and Implementation (PLDI
'99), Atlanta, Georgia, 2--4 May 1999",
title = "Proceedings of the {ACM} {SIGPLAN} '99 Conference on
Programming Language Design and Implementation ({PLDI}
'99), Atlanta, Georgia, 2--4 May 1999",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "????",
year = "1999",
ISBN = "????",
ISBN-13 = "????",
LCCN = "????",
bibdate = "Thu May 13 14:45:29 1999",
bibsource = "http://www.acm.org/pubs/contents/proceedings/pldi/301122/index.html;
acknowledgement = ack-nhfb,
editor = "{ACM}",
booktitle = "SC'99: Oregon Convention Center 777 NE Martin Luther
King Jr. Boulevard, Portland, Oregon, November 11--18,
title = "{SC}'99: Oregon Convention Center 777 {NE} Martin
Luther King Jr. Boulevard, Portland, Oregon, November
11--18, 1999",
publisher = pub-ACM # " and " # pub-IEEE,
address = pub-ACM:adr # " and " # pub-IEEE:adr,
pages = "????",
year = "1999",
ISBN = "????",
ISBN-13 = "????",
LCCN = "????",
bibdate = "Thu Feb 24 09:35:00 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
editor = "Malcolm P. Atkinson and Maria E. Orlowska and Patrick
Valduriez and Stanley B. Zdonik and Michael L. Brodie",
booktitle = "Proceedings of the Twenty-fifth International
Conference on Very Large Databases, Edinburgh,
Scotland, UK, 7--10 September, 1999",
title = "Proceedings of the Twenty-fifth International
Conference on Very Large Databases, Edinburgh,
Scotland, {UK}, 7--10 September, 1999",
publisher = pub-MORGAN-KAUFMANN,
address = pub-MORGAN-KAUFMANN:adr,
pages = "xviii + 761",
year = "1999",
ISBN = "1-55860-615-7",
ISBN-13 = "978-1-55860-615-9",
LCCN = "QA76.9.D3 I559 1999",
bibdate = "Tue Oct 24 18:36:50 MDT 2000",
bibsource = "DBLP; http://dblp.uni-trier.de;
https://www.math.utah.edu/pub/tex/bib/vldb.bib; OCLC
Proceedings database",
note = "Also known as VLDB'99",
acknowledgement = ack-nhfb,
keywords = "very large data bases; VLDB",
editor = "J. J. Dongarra and E. Luque and Tomas Margalef",
booktitle = "{Recent advances in parallel virtual machine and
message passing interface: 6th European PVM\slash {MPI}
Users' Group Meeting, Barcelona, Spain, September
26--29, 1999: Proceedings}",
title = "{Recent advances in parallel virtual machine and
message passing interface: 6th European PVM\slash {MPI}
Users' Group Meeting, Barcelona, Spain, September
26--29, 1999: Proceedings}",
volume = "1697",
publisher = pub-SV,
address = pub-SV:adr,
pages = "xvii + 551",
year = "1999",
DOI = "????",
ISBN = "3-540-66549-8 (softcover)",
ISBN-13 = "978-3-540-66549-6 (softcover)",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
LCCN = "QA76.58 E973 1999",
bibdate = "Wed Dec 8 06:34:56 MST 1999",
bibsource = "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
series = ser-LNCS,
URL = "http://link.springer-ny.com/link/service/series/0558/tocs/t1697.htm;
acknowledgement = ack-nhfb,
alttitle = "PVM\slash MPI '99",
keywords = "Data transmission systems; Parallel computers; Virtual
computer systems",
editor = "{IEEE}",
booktitle = "Hot Chips 11: Stanford University, Stanford,
California, August 15--17, 1999",
title = "Hot Chips 11: Stanford University, Stanford,
California, August 15--17, 1999",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "????",
year = "1999",
ISBN = "????",
ISBN-13 = "????",
LCCN = "????",
bibdate = "Mon Jan 08 05:26:43 2001",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hot-chips.bib;
URL = "http://www.hotchips.org/hotc11_index.html",
acknowledgement = ack-nhfb,
editor = "{ACM}",
booktitle = "SC2000: High Performance Networking and Computing.
Dallas Convention Center, Dallas, TX, USA, November
4--10, 2000",
title = "{SC2000}: High Performance Networking and Computing.
Dallas Convention Center, Dallas, {TX}, {USA}, November
4--10, 2000",
publisher = pub-ACM # " and " # pub-IEEE,
address = pub-ACM:adr # " and " # pub-IEEE:adr,
pages = "????",
year = "2000",
ISBN = "",
ISBN-13 = "",
LCCN = "",
bibdate = "Thu Feb 24 09:35:00 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
URL = "http://www.sc2000.org/proceedings/info/fp.pdf",
acknowledgement = ack-nhfb,
editor = "Anonymous",
booktitle = "Cool Chips III: An International Symposium on
Low-Power and High-Speed Chips, Kikai-Shinko-Kaikan,
Tokyo, Japan April 24--25, 2000",
title = "Cool Chips {III}: An International Symposium on
Low-Power and High-Speed Chips, Kikai-Shinko-Kaikan,
Tokyo, Japan April 24--25, 2000",
publisher = "????",
address = "????",
pages = "????",
year = "2000",
ISBN = "",
ISBN-13 = "",
LCCN = "",
bibdate = "Mon Jan 08 09:19:21 2001",
bibsource = "http://www.coolchips.org/index-cool3.html;
acknowledgement = ack-nhfb,
editor = "Alice E. Koniges",
booktitle = "Industrial Strength Parallel Computing",
title = "Industrial Strength Parallel Computing",
publisher = pub-MORGAN-KAUFMANN,
address = pub-MORGAN-KAUFMANN:adr,
pages = "xxv + 597",
year = "2000",
ISBN = "1-55860-540-1",
ISBN-13 = "978-1-55860-540-4",
LCCN = "QA76.58 .I483 2000",
bibdate = "Fri Feb 04 18:30:40 2000",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
editor = "{USENIX}",
booktitle = "Proceedings of the 7th USENIX Tcl\slash Tk Conference
(Tcl/2k): February 14--18, 2000, Austin, Texas, USA",
title = "Proceedings of the 7th {USENIX} Tcl\slash Tk
Conference (Tcl/2k): February 14--18, 2000, Austin,
Texas, {USA}",
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "194",
year = "2000",
ISBN = "1-880446-24-3",
ISBN-13 = "978-1-880446-24-9",
LCCN = "????",
bibdate = "Wed Oct 16 09:54:12 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://db.usenix.org/publications/library/proceedings/tcl2k/",
acknowledgement = ack-nhfb,
editor = "{USENIX}",
booktitle = "2000 USENIX Annual Technical Conference: San Diego,
CA, USA, June 18--23, 2000",
title = "2000 {USENIX} Annual Technical Conference: San Diego,
{CA}, {USA}, June 18--23, 2000",
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "350",
year = "2000",
ISBN = "1-880446-22-7",
ISBN-13 = "978-1-880446-22-5",
LCCN = "????",
bibdate = "Mon Oct 14 07:43:52 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.usenix.org/publications/library/proceedings/usenix2000",
acknowledgement = ack-nhfb,
editor = "{ACM}",
booktitle = "Proceedings of the {ACM 2001 Java Grande\slash ISCOPE
Conference: Palo Alto, Calif., June 2--4, 2001}",
title = "Proceedings of the {ACM 2001 Java Grande\slash ISCOPE
Conference: Palo Alto, Calif., June 2--4, 2001}",
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "vi + 186",
year = "2001",
ISBN = "1-58113-359-6",
ISBN-13 = "978-1-58113-359-2",
LCCN = "QA76.9.O35 A26 2001",
bibdate = "Mon May 6 06:26:30 MDT 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
keywords = "Java (computer program language) -- congresses;
object-oriented methods (computer science) --
editor = "Ronald F. Boisvert and Ping Tak Peter Tang",
booktitle = "The architecture of scientific software: {IFIP
TC2/WG2.5 Working Conference on the Architecture of
Scientific Software, October 2--4, 2000, Ottawa,
title = "The architecture of scientific software: {IFIP
TC2/WG2.5 Working Conference on the Architecture of
Scientific Software, October 2--4, 2000, Ottawa,
volume = "60",
publisher = pub-KLUWER,
address = pub-KLUWER:adr,
pages = "xx + 358",
year = "2001",
ISBN = "0-7923-7339-1",
ISBN-13 = "978-0-7923-7339-1",
LCCN = "QA76.758 .I345 2000",
bibdate = "Fri May 27 08:46:38 2005",
bibsource = "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
series = "IFIP",
acknowledgement = ack-nhfb,
tableofcontents = "Preface (p. ix)\\
Contributing Authors (p. xv)\\
Part I: Large-Scale Systems Integration\\
Network-Based Scientific Computing: Elias N. Houstis,
Ann Christine Catlin, Ganesh Balakrishnan, Nitesh
Dhanjani, GaHyun Park, John R. Rice, Spyros Lalis,
Manolis Stamatogiannakis, Catherine E. Houstis (pp.
3--28) \\
Future Generations of Problem-Solving Environments:
Jos{\'e} C. Cunha (pp. 29--38) \\
Developing an Architecture to Support the
Implementation and Development of Scientific computing
Applications: Dorian C. Arnold, Jack Dongarra (pp.
39--56) \\
PETSc and Overture: Lessons Learned Developing an
Interface between Components: Kristopher R. Buschelman,
William Gropp, Lois C. McInnes, Barry F. Smith (pp.
57--68) \\
Component Technology for High-Performance Scientific
Simulation Software: Tom Epperly, Scott R. Kohn, Gary
Kumfert (pp. 69--86) \\
A New Approach to Software Integration Frameworks for
Multi-physics Simulation Codes: Eric de Sturler, Jay
Hoeflinger, Laxmikant V. Kal{\'e}, Milind Bhandarkar
(pp. 87--104) \\
Code Coupling using Parallel CORBA Objects: Christophe
Ren{\'e}, Thierry Priol, Guillaume All{\'e}on (pp.
105--118) \\
A Collaborative Code Development Environment for
Computational Electro-magnetics: Matthew S. Shields,
Omer F. Rana, David W. Walker, David Colby (pp.
119--144) \\
Part II: The Architecture of Components\\
On the Role of Mathematical Abstractions for Scientific
Computing: Krister {\AA}hlander, Magne Haveraaen, Hans
Z. Munthe-Kaas (pp. 145--158) \\
Object-oriented Modeling of Parallel PDE Solvers:
Michael Thun{\'e}, Krister {\AA}hlander, Malin
Ljungberg, Markus Nord{\'e}n, Kurt Otto, Jarmo
Rantakokko (pp. 159--174) \\
Broadway: A Software Architecture for Scientific
Computing: Samuel Z. Guyer, Calvin Lin (pp. 175--192)
\\
Formal Methods for High-Performance Linear Algebra
Libraries: John A. Gunnels, Robert A. van de Geijn (pp.
193--210) \\
New Generalized Matrix Data Structures Lead to a
Variety of High-Performance Algorithms: Fred G.
Gustavson (pp. 211--234) \\
A Comprehensive DFT API for Scientific Computing: Ping
Tak Peter Tang (pp. 235--256) \\
Using A Fortran Interface to POSIX Threads: Richard J.
Hanson, Clay P. Breshears, Henry A. Gabb (pp. 257--272)
\\
Data Management Systems for Scientific Applications:
Reagan Moore (pp. 273--284) \\
Software Components for Application Development: Arnaud
Desitter, Antoine Le Hyaric, Geoff Morgan, Gareth Shaw,
Anne E. Trefethen (pp. 285--300) \\
Hierarchical Representation and Computation of
Approximate Solutions in Scientific Simulations: Wayne
H. Enright (pp. 301--316) \\
Software Architecture for the Investigation of
Controllable Models with Complex Data Sets: Dmitry
Belyshev, Vladimir I. Gurman (pp. 317--332) \\
A Mixed-Language Programming Methodology for High
Performance Java Computing: Vladimir Getov (pp.
333--350) \\
Part III: Conference Information\\
The Architecture of Scientific Software: the Conference
(pp. 351--356)\\
Index (pp. 357--358)",
editor = "Rudolf Eigenmann and Michael J. Voss",
booktitle = "{OpenMP} shared memory parallel programming:
International Workshop on {OpenMP} Applications and
Tools, {WOMPAT} 2001, West Lafayette, {IN}, {USA}, July
30--31, 2001: proceedings",
title = "{OpenMP} shared memory parallel programming:
International Workshop on {OpenMP} Applications and
Tools, {WOMPAT} 2001, West Lafayette, {IN}, {USA}, July
30--31, 2001: proceedings",
volume = "2104",
publisher = pub-SV,
address = pub-SV:adr,
pages = "x + 184",
year = "2001",
ISBN = "3-540-42346-X (paperback)",
ISBN-13 = "978-3-540-42346-1 (paperback)",
LCCN = "QA76.642 .I589 2001; QA267.A1 L43 no.2104",
bibdate = "Thu Jan 17 11:49:19 MST 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
series = ser-LNCS,
URL = "http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm",
acknowledgement = ack-nhfb,
keywords = "parallel programming (computer science) --
editor = "{USENIX}",
booktitle = "Proceedings of the Java Virtual Machine Research and
Technology Sy[m]posium (JVM '01): April 23--24, 2001,
Monterey, California, USA. Berkeley, CA",
title = "Proceedings of the Java Virtual Machine Research and
Technology Sy[m]posium ({JVM} '01): April 23--24, 2001,
Monterey, California, {USA}. Berkeley, {CA}",
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "232",
year = "2001",
ISBN = "1-880446-11-1",
ISBN-13 = "978-1-880446-11-9",
LCCN = "QA76.73.J38 J42 2001",
bibdate = "Tue Oct 15 12:35:06 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.usenix.org/publications/library/proceedings/jvm01/",
acknowledgement = ack-nhfb,
editor = "{IEEE}",
booktitle = "{SC2002}: From Terabytes to Insight. Proceedings of
the {IEEE ACM SC 2002 Conference, November 16--22,
2002, Baltimore, MD, USA}",
title = "{SC2002}: From Terabytes to Insight. Proceedings of
the {IEEE ACM SC 2002 Conference, November 16--22,
2002, Baltimore, MD, USA}",
publisher = pub-IEEE,
address = pub-IEEE:adr,
pages = "????",
year = "2002",
ISBN = "0-7695-1524-X",
ISBN-13 = "978-0-7695-1524-3",
LCCN = "????",
bibdate = "Thu Feb 21 18:29:36 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
editor = "{USENIX}",
booktitle = "Proceedings of BSDCon 2002: February 11--14, 2002,
Cathedral Hill Hotel, San Francisco, CA",
title = "Proceedings of {BSDCon} 2002: February 11--14, 2002,
Cathedral Hill Hotel, San Francisco, {CA}",
publisher = pub-USENIX,
address = pub-USENIX:adr,
pages = "viii + 151",
year = "2002",
ISBN = "1-880446-02-2",
ISBN-13 = "978-1-880446-02-7",
LCCN = "QA76.76.O63 B736 2002",
bibdate = "Tue Oct 15 12:45:29 2002",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
URL = "http://www.usenix.org/publications/library/proceedings/bsdcon02/tech.html",
acknowledgement = ack-nhfb,
editor = "Allyn Romanow and Jeff Mogul",
booktitle = "{Proceedings of the ACM SIGCOMM Workshop on
Network-I/O Convergence: Experience, Lessons,
Implications 2003, Karlsruhe, Germany, August 25--27,
title = "{Proceedings of the ACM SIGCOMM Workshop on
Network-I/O Convergence: Experience, Lessons,
Implications 2003, Karlsruhe, Germany, August 25--27,
publisher = pub-ACM,
address = pub-ACM:adr,
pages = "????",
year = "2003",
ISBN = "????",
ISBN-13 = "????",
LCCN = "TK5105.5",
bibdate = "Sat Oct 14 14:04:48 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
note = "ACM order number 534032.",
acknowledgement = ack-nhfb,
editor = "{ACM}",
booktitle = "SC2003: Igniting Innovation. {Phoenix, AZ, November
15--21, 2003}",
title = "{SC2003}: Igniting Innovation. {Phoenix, AZ, November
15--21, 2003}",
publisher = pub-ACM # " and " # pub-IEEE,
address = pub-ACM:adr # " and " # pub-IEEE:adr,
pages = "????",
year = "2003",
ISBN = "1-58113-695-1",
ISBN-13 = "978-1-58113-695-1",
LCCN = "????",
bibdate = "Thu Feb 21 18:29:36 2003",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
acknowledgement = ack-nhfb,
editor = "Anonymous",
booktitle = "Cool Chips VI: An International Symposium on Low-Power
and High-Speed Chips, Yokohama Joho Bunka Center,
Yokohama, Japan (Yokohama Media \& Communications
Center, Yokohama, Japan) April 16--18, 2003",
title = "Cool Chips {VI}: An International Symposium on
Low-Power and High-Speed Chips, Yokohama Joho Bunka
Center, Yokohama, Japan (Yokohama Media \&
Communications Center, Yokohama, Japan) April 16--18,
2003",
publisher = "????",
address = "????",
pages = "????",
year = "2003",
ISBN = "????",
ISBN-13 = "????",
LCCN = "????",
bibdate = "Fri Jan 09 16:53:37 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cool-chips.bib;
acknowledgement = ack-nhfb,
editor = "Barbara M. Chapman",
booktitle = "{Shared memory parallel programming with OpenMP: 5th
International Workshop on OpenMP Applications and
Tools, WOMPAT 2004, Houston, TX, USA, May 17--18, 2004:
Revised selected papers}",
title = "{Shared memory parallel programming with OpenMP: 5th
International Workshop on OpenMP Applications and
Tools, WOMPAT 2004, Houston, TX, USA, May 17--18, 2004:
Revised selected papers}",
volume = "3349",
publisher = pub-SV,
address = pub-SV:adr,
pages = "x + 147",
year = "2005",
DOI = "https://doi.org/10.1007/b105895",
ISBN = "3-540-24560-X",
ISBN-13 = "978-3-540-24560-5",
ISSN = "0302-9743 (print), 1611-3349 (electronic)",
LCCN = "QA76 .A1 L42 NO.3349",
bibdate = "Thu Jun 2 07:26:02 MDT 2005",
bibsource = "clavis.ucalgary.ca:2200/UNICORN;
series = ser-LNCS,
URL = "http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=3349;
acknowledgement = ack-nhfb,
meetingname = "International Workshop on OpenMP Applications and
Tools (2004: Houston, Tex.)",
subject = "Parallel programming (Computer science); Congresses",
editor = "Scott Lathrop and Jim Costa and William Kramer",
booktitle = "{SC'11: Proceedings of 2011 International Conference
for High Performance Computing, Networking, Storage and
Analysis, Seattle, WA, November 12--18 2011}",
title = "{SC'11: Proceedings of 2011 International Conference
for High Performance Computing, Networking, Storage and
Analysis, Seattle, WA, November 12--18 2011}",
publisher = pub-ACM # " and " # pub-IEEE,
address = pub-ACM:adr # " and " # pub-IEEE:adr,
pages = "????",
year = "2011",
ISBN = "1-4503-0771-X",
ISBN-13 = "978-1-4503-0771-0",
LCCN = "????",
bibdate = "Fri Dec 16 11:11:35 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
acknowledgement = ack-nhfb,
xxeditor = "{ACM}",
editor = "Jeffrey Hollingsworth",
booktitle = "{SC '12: Proceedings of the International Conference
on High Performance Computing, Networking, Storage and
Analysis, Salt Lake Convention Center, Salt Lake City,
UT, USA, November 10--16, 2012}",
title = "{SC '12: Proceedings of the International Conference
on High Performance Computing, Networking, Storage and
Analysis, Salt Lake Convention Center, Salt Lake City,
UT, USA, November 10--16, 2012}",
publisher = pub-IEEE,
address = pub-IEEE:adr,
year = "2012",
ISBN = "1-4673-0804-8",
ISBN-13 = "978-1-4673-0804-5",
bibdate = "Thu Nov 15 07:35:55 2012",
bibsource = "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
acknowledgement = ack-nhfb,