Giorgi, Roberto
Exploring Future Many-Core Architectures: The TERAFLUX Evaluation Framework Book Chapter
In: Advances in Computers (ADV COMPUT), Elsevier, 2016, ISSN: 0065-2458.
% Chapter in the Elsevier book series "Advances in Computers".
% NOTE(review): the series volume number and the page range are not recorded
% in this entry - add them once confirmed against the publisher record.
@inbook{Giorgi2016c,
  author    = {Giorgi, Roberto},
  title     = {Exploring Future Many-Core Architectures: The {TERAFLUX} Evaluation Framework},
  series    = {Advances in Computers},
  publisher = {Elsevier},
  year      = {2016},
  date      = {2016-10-01},
  doi       = {10.1016/bs.adcom.2016.09.002},
  issn      = {0065-2458},
  abstract  = {The design of new computer systems always requires a strong simulation effort in order to evaluate different design options. This is especially true if the system is to be produced at a date far in the future, such as in the case of TERAFLUX, a system aimed at containing something like 10¹² (1 TERA) transistors in a single package or a (multilayer) chip by 2020. At the basis of a TERAFLUX system, a dataflow execution model supports the execution of threads. In order to explore the design space, TERAFLUX provides an appropriate evaluation framework, at the scale of at least 1000 general purpose cores on a single chip. Predicting the performance of such a next-generation platform is not a trivial task. Today, no software-based tool exists that can provide cycle-level full-system simulation and faithfully predict the behavior of 1000 general-purpose cores, in an acceptable amount of time and with reasonable accuracy, while providing the flexibility of changing the execution model at the architectural level. A solid evaluation framework represents an important base for exploring future many cores. In this chapter, different options for simulating a 1000 general-purpose-core system are explored. Finally, we show the setup that successfully allowed us to evaluate our 1000 core target while running a full-system Linux operating system.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inbook}
}
Giorgi, Roberto; Bettin, Nicola; Gai, Paolo; Martorell, Xavier; Rizzo, Antonio
AXIOM: A Flexible Platform for the Smart Home Book Chapter
In: Keramidas, Georgios; Voros, Nikolaos; Hübner, Michael (Ed.): Components and Services for IoT Platforms: Paving the Way for IoT Standards, pp. 57–74, Springer International Publishing, Cham, 2016, ISBN: 978-3-319-42304-3.
% Chapter in the edited Springer book "Components and Services for IoT
% Platforms"; the book title lives in booktitle, the publisher only in publisher.
@inbook{Giorgi2016b,
  author    = {Giorgi, Roberto and Bettin, Nicola and Gai, Paolo and Martorell, Xavier and Rizzo, Antonio},
  title     = {{AXIOM}: A Flexible Platform for the Smart Home},
  editor    = {Keramidas, Georgios and Voros, Nikolaos and H{\"u}bner, Michael},
  booktitle = {Components and Services for {IoT} Platforms: Paving the Way for {IoT} Standards},
  publisher = {Springer International Publishing},
  address   = {Cham},
  pages     = {57--74},
  year      = {2016},
  date      = {2016-09-24},
  doi       = {10.1007/978-3-319-42304-3_3},
  url       = {http://dx.doi.org/10.1007/978-3-319-42304-3_3},
  isbn      = {978-3-319-42304-3},
  abstract  = {The AXIOM hardware/software platform aims at bringing easy programmability on top of a cluster of processors by using a fast interconnect and FPGA as a basis for building a scalable embedded system. The Smart Home is one of the key scenarios in which AXIOM could be useful for the Internet-of-Things (IoT). In Smart Homes, everything is linked to the flow of information that from the on the field devices needs to arrive to the cloud servers. The information sensed in the environment will not be transmitted as is to the higher layers, but is somehow interpreted to provide a synthetic light-weight representation of the environment. In such a scenario, it is then clear that there is a need for peripheral nodes as well as intermediate gateways which needs to be able to perform high-performance computational loads. AXIOM provides the possibility of designing a cluster of low-power/low-budget boards, which could be packed inside a high-performance embedded low-cost product. The AXIOM boards are heterogeneous, thus allowing for even greater diversity which is needed in those kind of IoT scenarios. The cluster itself can then be integrated inside the IoT architectures as computational-power node, which could be the center of a distributed intelligence near the edges of the IoT network.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inbook}
}
Llort, Germán; Filgueras, Antonio; Jiménez-González, Daniel; Servat, Harald; Teruel, Xavier; Mercadal, Estanislao; Álvarez, Carlos; Giménez, Judit; Martorell, Xavier; Ayguadé, Eduard; Labarta, Jesús
The Secrets of the Accelerators Unveiled: Tracing Heterogeneous Executions Through OMPT Proceedings
In: OpenMP: Memory, Devices and Tasks, Springer International Publishing, 2016.
% Paper published in the Springer volume "OpenMP: Memory, Devices and Tasks".
% It is an individual contribution, hence inproceedings rather than proceedings.
@inproceedings{Llort2016,
  author    = {Llort, Germán and Filgueras, Antonio and Jiménez-González, Daniel and Servat, Harald and Teruel, Xavier and Mercadal, Estanislao and Álvarez, Carlos and Giménez, Judit and Martorell, Xavier and Ayguadé, Eduard and Labarta, Jesús},
  title     = {The Secrets of the Accelerators Unveiled: Tracing Heterogeneous Executions Through {OMPT}},
  booktitle = {{OpenMP}: Memory, Devices and Tasks},
  publisher = {Springer International Publishing},
  year      = {2016},
  date      = {2016-09-21},
  doi       = {10.1007/978-3-319-45550-1_16},
  url       = {https://link.springer.com/chapter/10.1007/978-3-319-45550-1_16},
  abstract  = {Heterogeneous systems are an important trend in the future of supercomputers, yet they can be hard to program and developers still lack powerful tools to gain understanding about how well their accelerated codes perform and how to improve them.
Having different types of hardware accelerators available, each with their own specific low-level APIs to program them, there is not yet a clear consensus on a standard way to retrieve information about the accelerator’s performance. To improve this scenario, OMPT is a novel performance monitoring interface that is being considered for integration into the OpenMP standard. OMPT allows analysis tools to monitor the execution of parallel OpenMP applications by providing detailed information about the activity of the runtime through a standard API. For accelerated devices, OMPT also facilitates the exchange of performance information between the runtime and the analysis tool. We implement part of the OMPT specification that refers to the use of accelerators both in the Nanos++ parallel runtime system and the Extrae tracing framework, obtaining detailed performance information about the execution of the tasks issued to the accelerated devices to later conduct insightful analysis.
Our work extends previous efforts in the field to expose detailed information from the OpenMP and OmpSs runtimes, regarding the activity and performance of task-based parallel applications. In this paper, we focus on the evaluation of FPGA devices studying the performance of two common kernels in scientific algorithms: matrix multiplication and Cholesky decomposition. Furthermore, this development is seamlessly applicable for the analysis of GPGPU accelerators and Intel® Xeon Phi™ co-processors operating under the OmpSs programming model.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Having different types of hardware accelerators available, each with their own specific low-level APIs to program them, there is not yet a clear consensus on a standard way to retrieve information about the accelerator’s performance. To improve this scenario, OMPT is a novel performance monitoring interface that is being considered for integration into the OpenMP standard. OMPT allows analysis tools to monitor the execution of parallel OpenMP applications by providing detailed information about the activity of the runtime through a standard API. For accelerated devices, OMPT also facilitates the exchange of performance information between the runtime and the analysis tool. We implement part of the OMPT specification that refers to the use of accelerators both in the Nanos++ parallel runtime system and the Extrae tracing framework, obtaining detailed performance information about the execution of the tasks issued to the accelerated devices to later conduct insightful analysis.
Our work extends previous efforts in the field to expose detailed information from the OpenMP and OmpSs runtimes, regarding the activity and performance of task-based parallel applications. In this paper, we focus on the evaluation of FPGA devices studying the performance of two common kernels in scientific algorithms: matrix multiplication and Cholesky decomposition. Furthermore, this development is seamlessly applicable for the analysis of GPGPU accelerators and Intel® Xeon Phi™ co-processors operating under the OmpSs programming model.
Mazumdar, Somnath; Ayguade, Eduard; Bettin, Nicola; Bueno, Javier; Ermini, Sara; Filgueras, Antonio; Jimenez-Gonzalez, Daniel; Martinez, Alvarez; Martorell, Xavier; Montefoschi, Francesco; Oro, David; Pnevmatikatos, Dionisis; Rizzo, Antonio; Theodoropoulos, Dimitris; Giorgi, Roberto
AXIOM: A Hardware-Software Platform for Cyber Physical Systems Journal Article
In: pp. 539–546, 2016, ISBN: 978-1-5090-2817-7.
% Conference paper (it has an ISBN and a page range but no journal).
% NOTE(review): the booktitle is inferred from the DOI prefix 10.1109/DSD.2016 -
% confirm the exact proceedings title against the publisher record.
% NOTE(review): the author "Martinez, Alvarez" may be a mangled
% "Alvarez Martinez, Carlos" - confirm before changing.
@inproceedings{Mazumdar2016,
  author    = {Mazumdar, Somnath and Ayguade, Eduard and Bettin, Nicola and
               Bueno, Javier and Ermini, Sara and Filgueras, Antonio and
               Jimenez-Gonzalez, Daniel and Martinez, Alvarez and Martorell, Xavier and
               Montefoschi, Francesco and Oro, David and Pnevmatikatos, Dionisis and
               Rizzo, Antonio and Theodoropoulos, Dimitris and Giorgi, Roberto},
  title     = {{AXIOM}: A Hardware-Software Platform for Cyber Physical Systems},
  booktitle = {2016 {Euromicro} Conference on Digital System Design ({DSD})},
  pages     = {539--546},
  year      = {2016},
  date      = {2016-09-07},
  doi       = {10.1109/DSD.2016.80},
  isbn      = {978-1-5090-2817-7},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Theodoropoulos, Dimitris; Pnevmatikatos, Dionisis; Garzarella, Stefano; Gai, Paolo; Rizzo, Antonio; Giorgi, Roberto
AXIOM: enabling parallel processing in cyber-physical systems. Proceedings Article
In: International Conference on Field-Programmable Logic and Applications, 2016.
% Workshop/conference talk; only a URL is available (no DOI recorded).
% The title carries no trailing period because bibliography styles add
% their own punctuation.
@inproceedings{Theodoropoulos2016,
  author    = {Theodoropoulos, Dimitris and Pnevmatikatos, Dionisis and Garzarella, Stefano and Gai, Paolo and Rizzo, Antonio and Giorgi, Roberto},
  title     = {{AXIOM}: Enabling Parallel Processing in Cyber-Physical Systems},
  booktitle = {International Conference on Field-Programmable Logic and Applications},
  year      = {2016},
  date      = {2016-09-01},
  url       = {http://fplwrc2016.cit-ec.uni-bielefeld.de/files/Dionisios_Pnevmatikatos.pdf},
  abstract  = {The AXIOM project focuses on developing an affordable CPS node that features general purpose capability coupled with reconfigurable resources. The nodes will be interconnected and a programming layer will turn them into a parallel processing system. The programming layer also makes easier the use of the reconfigurable resources for accelerators.
Harnessing the combined CPS resources enables a new level of ``edge'' processing. We will focus on the interconnection and modularity aspects of the project, and present the current status and the challenges we are facing mainly in performance and efficiency.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Harnessing the combined CPS resources enables a new level of “edge” processing. We will focus on the interconnection and modularity aspects of the project, and present the current status and the challenges we are facing mainly in performance and efficiency.
Alvarez, Carlos; Ayguade, Eduard; Bosch, Jaume; Bueno, Javier; Cherkashin, Artem; Filgueras, Antonio; Jimenez-Gonzalez, Daniel; Martorell, Xavier; Navarro, Nacho; Vidal, Miquel; Theodoropoulos, Dimitris; Pnevmatikatos, Dionisios N.; Catani, Davide; Oro, David; Fernandez, Carles; Segura, Carlos; Rodriguez, Javier; Hernando, Javier; Scordino, Claudio; Gai, Paolo; Passera, Pierluigi; Pomella, Alberto; Bettin, Nicola; Rizzo, Antonio; Giorgi, Roberto
The AXIOM Software Layers Journal Article
In: Microprocessors and Microsystems (Elsevier), 2016, ISSN: 0141-9331.
% Journal article; the journal field holds only the journal name, and the
% publisher is kept separately.
% NOTE(review): volume and page range are not recorded here - add once confirmed.
@article{Alvarez2016,
  author    = {Alvarez, Carlos and Ayguade, Eduard and Bosch, Jaume and Bueno, Javier and Cherkashin, Artem and Filgueras, Antonio and Jimenez-Gonzalez, Daniel and Martorell, Xavier and Navarro, Nacho and Vidal, Miquel and Theodoropoulos, Dimitris and Pnevmatikatos, Dionisios N. and Catani, Davide and Oro, David and Fernandez, Carles and Segura, Carlos and Rodriguez, Javier and Hernando, Javier and Scordino, Claudio and Gai, Paolo and Passera, Pierluigi and Pomella, Alberto and Bettin, Nicola and Rizzo, Antonio and Giorgi, Roberto},
  title     = {The {AXIOM} Software Layers},
  journal   = {Microprocessors and Microsystems},
  publisher = {Elsevier},
  year      = {2016},
  date      = {2016-07-09},
  doi       = {10.1016/j.micpro.2016.07.002},
  url       = {http://www.sciencedirect.com/science/article/pii/S0141933116300850},
  issn      = {0141-9331},
  abstract  = {People and objects will soon share the same digital network for information exchange in a world named as the age of the cyber-physical systems. The general expectation is that people and systems will interact in real-time. This poses pressure onto systems design to support increasing demands on computational power, while keeping a low power envelop. Additionally, modular scaling and easy programmability are also important to ensure these systems to become widespread. The whole set of expectations impose scientific and technological challenges that need to be properly addressed. The AXIOM project (Agile, eXtensible, fast I/O Module) will research new hardware/software architectures for cyber-physical systems to meet such expectations. The technical approach aims at solving fundamental problems to enable easy programmability of heterogeneous multi-core multi-board systems. AXIOM proposes the use of the task-based OmpSs programming model, leveraging low-level communication interfaces provided by the hardware. Modular scalability will be possible thanks to a fast interconnect embedded into each module. To this aim, an innovative ARM and FPGA-based board will be designed},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Giorgi, Roberto
Exploring Dataflow-based Thread Level Parallelism in Cyber-physical Systems Proceedings Article
In: pp. 295-300, ACM, New York, NY, USA, 2016, ISBN: 978-1-4503-4128-8.
% Conference paper in the ACM series tagged CF '16.
% NOTE(review): the booktitle is derived from the series tag and the DOI
% (10.1145/2903150) - confirm the exact proceedings title.
@inproceedings{Giorgi16c.bib,
  author    = {Giorgi, Roberto},
  title     = {Exploring Dataflow-based Thread Level Parallelism in Cyber-physical Systems},
  booktitle = {Proceedings of the {ACM} International Conference on Computing Frontiers},
  series    = {CF '16},
  publisher = {ACM},
  address   = {New York, NY, USA},
  pages     = {295--300},
  year      = {2016},
  date      = {2016-05-16},
  doi       = {10.1145/2903150.2906829},
  url       = {http://doi.acm.org/10.1145/2903150.2906829},
  isbn      = {978-1-4503-4128-8},
  abstract  = {Smart Cyber-Physical Systems (SCPS) aim not only at integrating computational platforms and physical processes, but also at creating larger "systems of systems" capable of satisfying multiple critical constraints such as energy efficiency, high-performance, safety, security, size and cost.
The AXIOM project aims at designing such systems by focusing on low-cost Single Board Computers (SBC), based on current System-on-Chips (SoC) that include both programmable logic (FPGA), multi-core CPUs, accelerators and peripherals. A dataflow execution model, partially developed in the TERAFLUX project, brings a more predictable and reliable execution.
The goals of AXIOM include: i) the possibility to easily program the system with a shared-memory model based on OmpSs; ii) the possibility of scaling up the system through a custom but inexpensive interconnect; iii) the possibility of accelerating a specific function on a single or multiple FPGAs of the system.
The dataflow execution model operates at thread-level granularity. In this paper the AXIOM execution model and the related memory model is further detailed. The memory model is key for the execution of threads while reducing the need of data transfers. The preliminary results confirm the scalability of this model},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
The AXIOM project aims at designing such systems by focusing on low-cost Single Board Computers (SBC), based on current System-on-Chips (SoC) that include both programmable logic (FPGA), multi-core CPUs, accelerators and peripherals. A dataflow execution model, partially developed in the TERAFLUX project, brings a more predictable and reliable execution.
The goals of AXIOM include: i) the possibility to easily program the system with a shared-memory model based on OmpSs; ii) the possibility of scaling up the system through a custom but inexpensive interconnect; iii) the possibility of accelerating a specific function on a single or multiple FPGAs of the system.
The dataflow execution model operates at thread-level granularity. In this paper the AXIOM execution model and the related memory model is further detailed. The memory model is key for the execution of threads while reducing the need of data transfers. The preliminary results confirm the scalability of this model
Scordino, Claudio; Morelli, Bruno
Sharing memory in modern distributed applications Proceedings
2016, ISBN: 978-1-4503-3739-7.
% Individual paper within the symposium proceedings named in booktitle,
% hence inproceedings rather than proceedings.
@inproceedings{Scordino2016,
  author    = {Scordino, Claudio and Morelli, Bruno},
  title     = {Sharing Memory in Modern Distributed Applications},
  booktitle = {31st Annual {ACM} Symposium on Applied Computing},
  pages     = {1918--1921},
  year      = {2016},
  date      = {2016-04-04},
  doi       = {10.1145/2851613.2851950},
  isbn      = {978-1-4503-3739-7},
  abstract  = {Traditionally, research on software distributed shared memory has been focused on optimizing the memory consistency models and the coherence protocols rather than the underlying run-time mechanisms. Thus, these systems have been often implemented on top of the paging functionalities offered by the operating system. This approach, however, introduces performance issues due to false sharing of data.
We propose an object-based approach that leverages the features of modern object-oriented programming to intercept single operations on data, hiding the underlying run-time mechanism. A possible implementation using the standard C++ programming language is shown and discussed.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
We propose an object-based approach that leverages the features of modern object-oriented programming to intercept single operations on data, hiding the underlying run-time mechanism. A possible implementation using the standard C++ programming language is shown and discussed.
Verdoscia, Lorenzo; Giorgi, Roberto
A Data-Flow Soft-Core Processor for Accelerating Scientific Calculation on FPGAs Journal Article
In: Mathematical Problems in Engineering, vol. 2016, no. 1, pp. 1-21, 2016, ISSN: 1563-5147.
% Journal article; the volume equals the publication year for this journal
% as recorded in the entry.
@article{Verdoscia2016,
  author    = {Verdoscia, Lorenzo and Giorgi, Roberto},
  title     = {A Data-Flow Soft-Core Processor for Accelerating Scientific Calculation on {FPGAs}},
  journal   = {Mathematical Problems in Engineering},
  volume    = {2016},
  number    = {1},
  pages     = {1--21},
  year      = {2016},
  date      = {2016-04-01},
  doi       = {10.1155/2016/3190234},
  url       = {http://www.hindawi.com/journals/mpe/2016/3190234/cta/},
  issn      = {1563-5147},
  abstract  = {We present a new type of soft-core processor called the “Data-Flow Soft-Core” that can be implemented through FPGA technology with adequate interconnect resources. This processor provides data processing based on data-flow instructions rather than control flow instructions. As a result, during an execution on the accelerator of the Data-Flow Soft-Core, both partial data and instructions are eliminated as traffic for load and store activities. Data-flow instructions serve to describe a program and to dynamically change the context of a data-flow program graph inside the accelerator, on-the-fly. Our proposed design aims at combining the performance of a fine-grained data-flow architecture with the flexibility of reconfiguration, without requiring a partial reconfiguration or new bit-stream for reprogramming it. The potential of the data-flow implementation of a function or functional program can be exploited simply by relying on its description through the data-flow instructions that reprogram the Data-Flow Soft-Core. Moreover, the data streaming process will mirror those present in other FPGA applications. Finally, we show the advantages of this approach by presenting two test cases and providing the quantitative and numerical results of our evaluations.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Burgio, Paolo; Alvarez, Carlos; Ayguadé, Eduard; Filgueras, Antonio; Jiménez-González, Daniel; Martorell, Xavier; Navarro, Nacho; Giorgi, Roberto
Simulating next-generation Cyber-physical computing platforms Journal Article
In: Ada User Journal, vol. 37, no. 1, pp. 59-63, 2016, ISSN: 1381-6551, (TO APPEAR).
% Journal article with full volume/number/pages and pubstate "published",
% so no "to appear" note is carried.
% Accented letters use the brace-wrapped form so BibTeX sorts and labels
% them as single characters.
@article{Burgio2016,
  author    = {Burgio, Paolo and Alvarez, Carlos and Ayguad{\'e}, Eduard and Filgueras, Antonio and Jim{\'e}nez-Gonz{\'a}lez, Daniel and Martorell, Xavier and Navarro, Nacho and Giorgi, Roberto},
  title     = {Simulating Next-Generation Cyber-Physical Computing Platforms},
  journal   = {Ada User Journal},
  volume    = {37},
  number    = {1},
  pages     = {59--63},
  year      = {2016},
  date      = {2016-03-01},
  url       = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-84974555745\&partnerID=40\&md5=934acb4936e9317c382f4ce6fdce40f1},
  issn      = {1381-6551},
  abstract  = {In specific domains, such as cyber-physical systems, platforms are quickly evolving to include multiple (many-) cores and programmable logic in a single system-on-chip, while including interfaces to commodity sensors/actuators. Programmable Logic (e.g., FPGA) allows for greater flexibility and dependability. However, the task of extracting the performance/watt potential of heterogeneous many-cores is often demanded at the application level, and this has strong implication on the HW/SW co-design process. Enabling fast prototyping of a board being designed is paramount to enable low time-to-market for applications running on it, and ultimately, for the whole platform: programmers must be provided with accurate hardware models, to support the software development cycle at the very early stages of the design process. Virtual platforms fulfill this need, providing that they can be in turn efficiently developed and tested in a few months timespan. In this position paper we will share our experience in the sphere of the AXIOM project, identifying key properties that virtual platforms modeling next-generation cyber-physical systems should have to quickly enable simulation-based software development for these platforms.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Logic (e.g., FPGA) allows for greater flexibility and dependability. However, the task of extracting the performance/watt potential of heterogeneous many-cores is often demanded at the application level, and this has strong implication on the HW/SW co-design process. Enabling fast prototyping of a board being designed is paramount to enable low time-to-market for applications running on it, and ultimately, for the whole platform: programmers must be provided with accurate hardware models, to support the software development cycle at the very early stages of the design process. Virtual platforms fulfill this need, providing that they can be in turn efficiently developed and tested in a few months timespan. In this position paper we will share our experience in the sphere of the AXIOM project, identifying key properties that virtual platforms modeling next-generation cyber-physical systems should have to quickly enable simulation-based software development for a these platforms.
Mazumdar, Somnath; Giorgi, Roberto
A Survey on Hardware and Software Support for Thread Level Parallelism Journal Article
In: 2016.
% arXiv preprint: the identifier in eprint matches the arXiv URL below.
% No journal is recorded for this work.
@article{Mazumdar2016b,
  author     = {Mazumdar, Somnath and Giorgi, Roberto},
  title      = {A Survey on Hardware and Software Support for Thread Level Parallelism},
  year       = {2016},
  date       = {2016-03-01},
  eprint     = {1603.09274},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/1603.09274},
  abstract   = {To support growing massive parallelism, functional components and also the capabilities of current processors are changing and continue to do so. Todays computers are built upon multiple processing cores and run applications consisting of a large number of threads, making runtime thread management a complex process. Further, each core can support multiple, concurrent thread execution. Hence, hardware and software support for threads is more and more needed to improve peak-performance capacity, overall system throughput, and has therefore been the subject of much research. This paper surveys, many of the proposed or currently available solutions for executing, distributing and managing threads both in hardware and software. The nature of current applications is diverse. To increase the system performance, all programming models may not be suitable to harness the built-in massive parallelism of multicore processors. Due to the heterogeneity in hardware, hybrid programming model (which combines the features of shared and distributed model) currently has become very promising. In this paper, first, we have given an overview of threads, threading mechanisms and its management issues during execution. Next, we discuss about different parallel programming models considering to their explicit thread support. We also review the programming models with respect to their support to shared-memory, distributed-memory and heterogeneity. Hardware support at execution time is very crucial to the performance of the system, thus different types of hardware support for threads also exist or have been proposed, primarily based on widely used programming models. We also further discuss on software support for threads, to mainly increase the deterministic behavior during runtime. Finally, we conclude the paper by discussing some common issues related to the thread management.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
A Survey on Hardware and Software Support for Thread Level Parallelism | Request PDF. Available from: https://www.researchgate.net/publication/301879025_A_Survey_on_Hardware_and_Software_Support_for_Thread_Level_Parallelism [accessed Feb 19 2018].
Giorgi, R.; Scionti, A.
A scalable thread scheduling co-processor based on data-flow principles Journal Article
In: vol. 53, pp. 100–108, 2015, ISSN: 0167-739X.
% Journal article.
% NOTE(review): the journal name is inferred from the DOI prefix
% (10.1016/j.future) and the ISSN 0167-739X - confirm against the publisher record.
@article{Giorgi15a.bib,
  author    = {Giorgi, R. and Scionti, A.},
  title     = {A Scalable Thread Scheduling Co-Processor Based on Data-Flow Principles},
  journal   = {Future Generation Computer Systems},
  volume    = {53},
  pages     = {100--108},
  year      = {2015},
  date      = {2015-12-01},
  doi       = {10.1016/j.future.2014.12.014},
  url       = {http://www.sciencedirect.com/science/article/pii/S0167739X1400274X},
  issn      = {0167-739X},
  abstract  = {Large synchronization and communication overhead will become a major concern in future extreme-scale machines (e.g., {HPC} systems, supercomputers). These systems will push upwards performance limits by adopting chips equipped with one order of magnitude more cores than today. Alternative execution models can be explored in order to exploit the high parallelism offered by future massive many-core chips. This paper proposes the integration of standard cores with dedicated co-processing units that enable the system to support a fine-grain data-flow execution model developed within the {TERAFLUX} project. An instruction set architecture extension for supporting fine-grain thread scheduling and execution is proposed. This instruction set extension is supported by the co-processor that provides hardware units for accelerating thread scheduling and distribution among the available cores. Two fundamental aspects are at the base of the proposed system: the programmers can adopt their preferred programming model, and the compilation tools can produce a large set of threads mainly communicating in a producer--consumer fashion, hence enabling data-flow execution. Experimental results demonstrate the feasibility of the proposed approach and its capability of scaling with the increasing number of cores.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Giorgi, Roberto
Scalable Embedded Systems: Towards the Convergence of High-Performance and Embedded Computing Proceedings Article
In: Proceedings of the 13th IEEE/IFIP International Conference on Embedded and Ubiquitous Computing (EUC 2015), 2015.
% Conference paper; only a project-hosted PDF URL is recorded (no DOI).
@inproceedings{Giorgi15d,
  author    = {Giorgi, Roberto},
  title     = {Scalable Embedded Systems: Towards the Convergence of High-Performance and Embedded Computing},
  booktitle = {Proceedings of the 13th IEEE/IFIP International Conference on Embedded and Ubiquitous Computing (EUC 2015)},
  year      = {2015},
  date      = {2015-10-20},
  url       = {http://www.axiom-project.eu/wp-content/uploads/2016/03/EUC15.pdf},
  abstract  = {Embedded System toolchains are highly customized for a specific System-on-Chip (SoC). When the application needs more performance, the designer is typically forced to adopt a new SoC and possibly another toolchain. The rationale for not scaling performance by using, e.g., two SoCs, is that maintaining most of the operations on-chip may allow for higher energy efficiency. We are exploring the feasibility and trade-offs of designing and manufacturing a new Single Board Computer (SBC) that could serve flexibly for a number of current and future applications, by allowing scalability through clusters of SBCs while keeping the same programming model for the SBC. This board is based on FPGAs and embedded processors, and its key points are: i) a fast custom interconnect for board-to-board communication and ii) an easily programmable environment which would allow both the off-loading of code into accelerators (either soft-IP blocks or hard-IP blocks) and, at the same time, the distribution of computation across boards. A key challenge to successfully deploying this paradigm is to properly distribute the threads across several boards without the explicit intervention of the programmer. In this paper we describe how to dynamically and efficiently distribute the computational threads in symbiosis with an appropriate memory model to allow the system scalability, so that we can double the performance by simply connecting two boards without i) changing the basic hardware components (e.g., to a different System-On-Chip) and ii) changing the programming model to follow the vendor specific toolchain. Our approach is to reduce data movement across boards. Our initial experiments have confirmed the feasibility of our approach.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Jimenez-Gonzalez, Daniel; Alvarez-Martinez, Carlos; Filgueras, Antonio; Martorell, Xavier; Langer, Jan; Noguera, Juanjo; Vissers, Kees
Coarse-Grain Performance Estimator for Heterogeneous Parallel Computing Architectures like Zynq All-Programmable SoC Journal Article
In: Second International Workshop on FPGAs for Software Programmers FSP 2015, vol. CoRR, 2015.
@inproceedings{Jimenez-Gonzalez2015,
title = {Coarse-Grain Performance Estimator for Heterogeneous Parallel Computing Architectures like Zynq All-Programmable SoC},
author = {Daniel Jimenez-Gonzalez and Carlos Alvarez-Martinez and Antonio Filgueras and Xavier Martorell and Jan Langer and Juanjo Noguera and Kees Vissers},
url = {http://www.axiom-project.eu/wp-content/uploads/2016/03/Coarse-Grain-Performance-Estimator-for-2.pdf},
year = {2015},
date = {2015-08-27},
booktitle = {Second International Workshop on FPGAs for Software Programmers FSP 2015},
note = {CoRR},
abstract = {Heterogeneous computing is emerging as a mandatory requirement for power-efficient system design. With this aim, modern heterogeneous platforms like Zynq All-Programmable SoC, that integrates ARM-based SMP and programmable logic, have been designed. However, those platforms introduce large design cycles consisting on hardware/software partitioning, decisions on granularity and number of hardware accelerators, hardware/software integration, bitstream generation, etc.
This paper presents a performance parallel heterogeneous estimation for systems where hardware/software co-design and run-time heterogeneous task scheduling are key. The results show that the programmer can quickly decide, based only on her/his OmpSs (OpenMP + extensions) application, which is the co-design that achieves nearly optimal heterogeneous parallel performance, based on the methodology presented and considering only synthesis estimation results. The methodology presented reduces the programmer co-design decision from hours to minutes and shows high potential on hardware/software heterogeneous parallel performance estimation on the Zynq All-Programmable SoC.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
This paper presents a performance parallel heterogeneous estimation for systems where hardware/software co-design and run-time heterogeneous task scheduling are key. The results show that the programmer can quickly decide, based only on her/his OmpSs (OpenMP + extensions) application, which is the co-design that achieves nearly optimal heterogeneous parallel performance, based on the methodology presented and considering only synthesis estimation results. The methodology presented reduces the programmer co-design decision from hours to minutes and shows high potential on hardware/software heterogeneous parallel performance estimation on the Zynq All-Programmable SoC.
Alvarez, Carlos; Ayguade, Eduard; Bueno, Javier; Filgueras, Antonio; Jimenez-Gonzalez, Daniel; Martorell, Xavier; Navarro, Nacho; Theodoropoulos, Dimitris; Pnevmatikatos, Dionisios; Catani, Davide; Scordino, Claudio; Gai, Paolo; Segura, Carlos; Fernandez, Carles; Oro, David; Rodriguez-Saeta, Javier; Passera, Pierluigi; Pomella, Alberto; Rizzo, Antonio; Giorgi, Roberto
The AXIOM Software Layers Journal Article
In: DSD 2015, 18th Euromicro Conference on Digital Systems Design (DSD), 2015.
@inproceedings{Alvarez2015,
title = {The AXIOM Software Layers},
author = {Carlos Alvarez and Eduard Ayguade and Javier Bueno and Antonio Filgueras and Daniel Jimenez-Gonzalez and Xavier Martorell and Nacho Navarro and Dimitris Theodoropoulos and Dionisios Pnevmatikatos and Davide Catani and Claudio Scordino and Paolo Gai and Carlos Segura and Carles Fernandez and David Oro and Javier Rodriguez-Saeta and Pierluigi Passera and Alberto Pomella and Antonio Rizzo and Roberto Giorgi},
doi = {10.1109/DSD.2015.52},
year = {2015},
date = {2015-08-26},
booktitle = {DSD 2015, 18th Euromicro Conference on Digital Systems Design (DSD)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Mondelli, Andrea; Ho, Nam; Scionti, Alberto; Solinas, Marco; Portero, Antoni; Giorgi, Roberto
Dataflow Support in x86_64 Multicore Architectures through Small Hardware Extensions Conference
2015.
@conference{DBLP:conf/dsd/MondelliHSSPG15,
title = {Dataflow Support in x86_64 Multicore Architectures through Small Hardware Extensions},
author = {Andrea Mondelli and Nam Ho and Alberto Scionti and Marco Solinas and Antoni Portero and Roberto Giorgi},
url = {https://pdfs.semanticscholar.org/5ead/bbc3f37eb79e0251d1f99a0a4c9c1bb169c0.pdf},
doi = {10.1109/DSD.2015.62},
year = {2015},
date = {2015-08-26},
booktitle = {DSD 2015, 18th Euromicro Conference on Digital Systems Design (DSD)},
abstract = {The path towards future high performance computers requires architectures able to efficiently run multi-threaded applications. In this context, dataflow-based execution models can improve the performance by limiting the synchronization overhead, thanks to a simple producer-consumer approach. This paper advocates the ISE of standard cores with a small hardware extension for efficiently scheduling the execution of threads on the basis of dataflow principles. A set of dedicated instructions allow the code to interact with the scheduler. Experimental results demonstrate that, the combination of dedicated scheduling units and a dataflow execution model improve the performance when compared with other techniques for code parallelization (e.g. OpenMP, Cilk).},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Theodoropoulos, Dimitris; Pnevmatikatos, Dionisis; Alvarez, Carlos; Ayguade, Eduard; Bueno, Javier; Filgueras, Antonio; Jimenez-Gonzalez, Daniel; Martorell, Xavier; Navarro, Nacho; Segura, Carlos; Fernandez, Carles; Oro, David; Saeta, Javier Rodriguez; Gai, Paolo; Rizzo, Antonio; Giorgi, Roberto
The AXIOM project (Agile, eXtensible, fast I/O Module) Journal Article
In: International Conference on Embedded Computer Systems: Architectures, Modeling and Simulation - SAMOS XV 2015, 2015.
@inproceedings{Theodoropoulos2015,
title = {The AXIOM project (Agile, eXtensible, fast I/O Module)},
author = {Dimitris Theodoropoulos and Dionisis Pnevmatikatos and Carlos Alvarez and Eduard Ayguade and Javier Bueno and Antonio Filgueras and Daniel Jimenez-Gonzalez and Xavier Martorell and Nacho Navarro and Carlos Segura and Carles Fernandez and David Oro and Javier Rodriguez Saeta and Paolo Gai and Antonio Rizzo and Roberto Giorgi},
url = {http://samos-conference.com/Resources_Samos_Websites/Proceedings_Repository_SAMOS/2015/Files/SS0_03.pdf},
year = {2015},
date = {2015-07-21},
booktitle = {International Conference on Embedded Computer Systems: Architectures, Modeling and Simulation - SAMOS XV 2015},
abstract = {The AXIOM project (Agile, eXtensible, fast I/O Module) aims at researching new software/hardware architectures for the future Cyber-Physical Systems (CPSs). These systems are expected to react in real-time, provide enough computational power for the assigned tasks, consume the least possible energy for such task (energy efficiency), scale up through modularity, allow for an easy programmability across performance scaling, and exploit at best existing standards at minimal costs. Current solutions for providing enough computational power are mainly based on multi- or many-core architectures. For example, some current research projects (such as ADEPT or PSOCRATES) are already investigating how to join efforts from the High-Performance Computing (HPC) and the Embedded Computing domains, which are both focused on high power efficiency, while GPUs and new Dataflow platforms such as Maxeler, or in general FPGAs, are claimed as the most energy efficient. We present the project’s initial approach, ideas and key concepts, and describe the AXIOM preliminary architecture. Our starting point uses power efficient multi-core nodes, such as ARM cores and FPGA accelerators on the same die, as in the Xilinx Zynq. We will work to provide an integrated environment that supports programmability of the parallel, interconnected nodes that form a CPS system, and evaluate our ideas using demanding test application scenarios.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Burresi, Giovanni; Giorgi, Roberto
A Field Experience for a Vehicle Recognition System using Magnetic Sensors Proceedings Article
In: IEEE MECO 2015, pp. 178-181, 2015, ISBN: 978-1-4799-8999-7.
@inproceedings{Burresi15a.bib,
title = {A Field Experience for a Vehicle Recognition System using Magnetic Sensors},
author = {Burresi, Giovanni and Giorgi, Roberto},
url = {http://www.axiom-project.eu/wp-content/uploads/2016/03/A-Field-Experience-for-a-Vehicle-Recognition-System-1.pdf},
doi = {10.1109/MECO.2015.7181897},
isbn = {978-1-4799-8999-7},
year = {2015},
date = {2015-06-14},
booktitle = {IEEE MECO 2015},
pages = {178--181},
abstract = {This paper describes the development and testing of a vehicle recognition prototype based on magnetic sensors. The aim of this research is to design a low cost, low power consumption and simple hardware platform for vehicle recognition. The goal is to recognize four types of vehicles (car, bus, mini-bus or camper) as they run over a set of magnetic sensors. We describe all steps for correct vehicle presence detection, pattern pre-processing, speed and length detection using a combination of an empirical and an analytical method for signal alignment. We collected a set of data regarding these types of vehicles and explain how to differentiate them. Our classification tests reach a confidence factor greater than 91%.},
keywords = {Hardware;Magnetic fields;Magnetic flux;Magnetic hysteresis;Magnetic sensors;Vehicles;classification;cyber-physical systems;magnetic sensors;traffic monitoring;vehicle recognition},
pubstate = {published},
tppubtype = {inproceedings}
}
keywords={Hardware;Magnetic fields;Magnetic flux;Magnetic hysteresis;Magnetic sensors;Vehicles;classification;cyber-physical systems;magnetic sensors;traffic monitoring;vehicle recognition
Verdoscia, Lorenzo; Vaccaro, Roberto; Giorgi, Roberto
A matrix multiplier case study for an evaluation of a configurable Dataflow-Machine Proceedings Article
In: ACM CF'15 - LP-EMS, pp. 1-6, 2015, ISBN: 978-1-4503-3358-0.
@inproceedings{Verdoscia15a.bib,
title = {A matrix multiplier case study for an evaluation of a configurable Dataflow-Machine},
author = {Lorenzo Verdoscia and Roberto Vaccaro and Roberto Giorgi},
url = {http://www.axiom-project.eu/wp-content/uploads/2016/03/A-matrix-multiplier-case-study-for-an-evaluation-of-a-configurable-Dataflow-machine.pdf},
doi = {10.1145/2742854.2747287},
isbn = {978-1-4503-3358-0},
year = {2015},
date = {2015-05-18},
booktitle = {ACM CF'15 - LP-EMS},
pages = {1--6},
abstract = {Configurable computing has become a subject of a great deal of research given its potential to greatly accelerate a wide variety of applications that require high throughput. In this context, the dataflow approach is still promising to accelerate the kernel of applications in the field of HPC. That is thanks to a computational dataflow engine able to execute dataflow program graphs directly in a custom hardware. On the other hand, evaluating radically different models of computation remains yet an open issue. In this paper we present as case study the matrix multiplication that constitutes the fundamental kernel of the linear algebra. The evaluation takes into account the execution of the matrix product both in non-pipelined and pipelined modes. Results obtained running the execution of the two modes on an FPGA-based demonstrator show the validity of the configurable Dataflow-Machine. Moreover, at the same throughput, the power consumption is expected to be lower than in clock-based systems.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Mondelli, Andrea; Ho, Nam; Scionti, Alberto; Solinas, Marco; Portero, Antoni; Giorgi, Roberto
Enhancing an x86_64 Multi-Core Architecture with Data-Flow Execution Support Proceedings Article
In: Article, ACM 2015 (Ed.): 2015, ISBN: 978-1-4503-3358-0.
@inproceedings{10.1145/2742854.2742896,
title = {Enhancing an x86_64 Multi-Core Architecture with Data-Flow Execution Support},
author = {Andrea Mondelli and Nam Ho and Alberto Scionti and Marco Solinas and Antoni Portero and Roberto Giorgi},
url = {http://www.axiom-project.eu/wp-content/uploads/2016/03/Enhancing-an-x86_64-Multi-Core-Architecture-with-1.pdf},
doi = {10.1145/2742854.2742896},
isbn = {978-1-4503-3358-0},
year = {2015},
date = {2015-05-06},
booktitle = {ACM CF'15},
abstract = {Future exascale machines will require multi/many-core architectures able to energy-efficiently run multi-threaded applications.
Data-flow execution models have demonstrated to be capable of improving execution performance by limiting the synchronization overhead. This paper proposes to augment cores with a minimalistic set of hardware units and dedicated instructions that allow energy-efficiently scheduling the execution of threads on the basis of data-flow principles. Experimental results show performance improvements of the system when compared with other techniques (e.g., OpenMP, Cilk).},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Data-flow execution models have demonstrated to be capable of improving execution performance by limiting the synchronization overhead. This paper proposes to augment cores with a minimalistic set of hardware units and dedicated instructions that allow energy-efficiently scheduling the execution of threads on the basis of data-flow principles. Experimental results show performance improvements of the system when compared with other techniques (e.g., OpenMP, Cilk).
Sorry, no publications matched your criteria.