2019 |
Andrzejewski, Witold; Boinski, Pawel Clique-based Maximal Co-location Pattern Mining Algorithm Technical Report Poznan University of Technology (RA-7/2019), 2019. @techreport{AB2019, title = {Clique-based Maximal Co-location Pattern Mining Algorithm}, author = {Witold Andrzejewski and Pawel Boinski}, year = {2019}, date = {2019-12-01}, number = {RA-7/2019}, institution = {Poznan University of Technology}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } |
2018 |
Andrzejewski, Witold; Boinski, Pawel Promising Directions of Research on Spatial and Spatiotemporal Co-Location Pattern Mining Technical Report Poznan University of Technology (RA-5/2018), 2018. @techreport{AB2018b, title = {Promising Directions of Research on Spatial and Spatiotemporal Co-Location Pattern Mining}, author = {Witold Andrzejewski and Pawel Boinski}, year = {2018}, date = {2018-12-01}, number = {RA-5/2018}, institution = {Poznan University of Technology}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } |
Andrzejewski, Witold; Boinski, Pawel Promising Directions of Research on Spatial and Spatiotemporal Co-Location Pattern Mining Technical Report Poznan University of Technology (RA-5/2018), 2018. @techreport{AB2018c, title = {Promising Directions of Research on Spatial and Spatiotemporal Co-Location Pattern Mining}, author = {Witold Andrzejewski and Pawel Boinski}, year = {2018}, date = {2018-12-01}, number = {RA-5/2018}, institution = {Poznan University of Technology}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } |
Andrzejewski, Witold; Boinski, Pawel Parallel Approach to Incremental Co-location Pattern Mining Journal Article Forthcoming Information Sciences, Forthcoming. @article{AB2018, title = {Parallel Approach to Incremental Co-location Pattern Mining}, author = {Witold Andrzejewski and Pawel Boinski}, url = {https://www.sciencedirect.com/science/article/pii/S0020025518307151}, doi = {10.1016/j.ins.2018.09.016}, year = {2018}, date = {2018-09-17}, journal = {Information Sciences}, abstract = {Co-location pattern mining is an important area of spatial data mining. In many real word applications, new data is continuously arriving to the system and is stored in spatial databases. As co-location discovery is computationally demanding task, it is crucial to maintain co-location patterns for such dynamic databases without recalculating them from scratch. In this paper we present the first GPU-parallelized solution to this problem. Our contribution is threefold: 1) we present a modified version of EUCOLOC algorithm using the iCPI-tree method called iCPI-EUCOLOC, 2) we modify state-of-the-art MGPUCPM algorithm to implement iCPI-EUCOLOC algorithm on GPUs and 3) we present experimental results showing large performance improvements over the original MGPUCPM algorithm. Our solution allows to reduce user waiting times and is economically beneficial due to the reduced overall computation time.}, keywords = {}, pubstate = {forthcoming}, tppubtype = {article} } Co-location pattern mining is an important area of spatial data mining. In many real word applications, new data is continuously arriving to the system and is stored in spatial databases. As co-location discovery is computationally demanding task, it is crucial to maintain co-location patterns for such dynamic databases without recalculating them from scratch. In this paper we present the first GPU-parallelized solution to this problem. 
Our contribution is threefold: 1) we present a modified version of EUCOLOC algorithm using the iCPI-tree method called iCPI-EUCOLOC, 2) we modify state-of-the-art MGPUCPM algorithm to implement iCPI-EUCOLOC algorithm on GPUs and 3) we present experimental results showing large performance improvements over the original MGPUCPM algorithm. Our solution allows to reduce user waiting times and is economically beneficial due to the reduced overall computation time. |
Andrzejewski, Witold; Boinski, Pawel Efficient spatial co-location pattern mining on multiple GPUs Journal Article Expert Systems with Applications, 93 (Supplement C), pp. 465–483, 2018, ISSN: 0957-4174. @article{ANDRZEJEWSKI2018465, title = {Efficient spatial co-location pattern mining on multiple GPUs}, author = {Witold Andrzejewski and Pawel Boinski}, editor = {Binshan Lin}, url = {http://www.sciencedirect.com/science/article/pii/S0957417417306991 https://authors.elsevier.com/c/1V-ul3PiGT3tDl}, doi = {10.1016/j.eswa.2017.10.025}, issn = {0957-4174}, year = {2018}, date = {2018-03-01}, journal = {Expert Systems with Applications}, volume = {93}, number = {Supplement C}, pages = {465--483}, abstract = {Abstract In this paper, we investigate Co-location Pattern Mining (CPM) from big spatial datasets. CPM consists in searching for types of objects that are frequently located together in a spatial neighborhood. Knowledge about such patterns is very important in fields like biology, environmental sciences, epidemiology etc. However, CPM is computationally challenging, mainly due to the large number of pattern instances hidden in spatial data. In this work, we propose a new solution that can utilize the power of multiple GPUs to increase the performance of CPM. The proposed solution is also capable of coping with the GPU memory limits by dividing the work into multiple packages and compressing internal data structures. Experiments performed on large synthetic and real-world datasets prove that we can achieve an order of magnitude speedups in comparison to the efficient multithreaded CPU implementation. Our solution can greatly improve the performance of data analysis, using widely available and energy efficient graphics cards. 
As a result, CPM in large datasets is more viable for university researchers as well as smaller companies and organizations.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Abstract In this paper, we investigate Co-location Pattern Mining (CPM) from big spatial datasets. CPM consists in searching for types of objects that are frequently located together in a spatial neighborhood. Knowledge about such patterns is very important in fields like biology, environmental sciences, epidemiology etc. However, CPM is computationally challenging, mainly due to the large number of pattern instances hidden in spatial data. In this work, we propose a new solution that can utilize the power of multiple GPUs to increase the performance of CPM. The proposed solution is also capable of coping with the GPU memory limits by dividing the work into multiple packages and compressing internal data structures. Experiments performed on large synthetic and real-world datasets prove that we can achieve an order of magnitude speedups in comparison to the efficient multithreaded CPU implementation. Our solution can greatly improve the performance of data analysis, using widely available and energy efficient graphics cards. As a result, CPM in large datasets is more viable for university researchers as well as smaller companies and organizations. |
2017 |
Andrzejewski, Witold; Drozdowski, Maciej; Mu, Gang; Sun, Yong Chao Two-Echelon System Stochastic Optimization with R and CUDA Inproceedings Wyrzykowski, Roman; Dongarra, Jack; Deelman, Ewa; Karczewski, Konrad (Ed.): International Conference on Parallel Processing and Applied Mathematics, pp. 254–264, Springer, Cham, 2017. @inproceedings{ADMS2017, title = {Two-Echelon System Stochastic Optimization with R and CUDA}, author = {Witold Andrzejewski and Maciej Drozdowski and Gang Mu and Yong Chao Sun}, editor = {Roman Wyrzykowski and Jack Dongarra and Ewa Deelman and Konrad Karczewski}, year = {2017}, date = {2017-09-10}, booktitle = {International Conference on Parallel Processing and Applied Mathematics}, volume = {10777}, pages = {254--264}, publisher = {Springer, Cham}, series = {LNCS}, abstract = {Parallelizing of the supply chain simulator is considered in this paper. The simulator is a key element of the algorithm optimizing inventory levels and order sizes in a two-echelon logistic system. The mode of operation of the logistic system and the optimization problem are defined first. Then, the inventory optimization algorithm is introduced. Parallelization for CUDA platform is presented. Benchmarking of the parallelized code demonstrates high efficiency of the software hybrid.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Parallelizing of the supply chain simulator is considered in this paper. The simulator is a key element of the algorithm optimizing inventory levels and order sizes in a two-echelon logistic system. The mode of operation of the logistic system and the optimization problem are defined first. Then, the inventory optimization algorithm is introduced. Parallelization for CUDA platform is presented. Benchmarking of the parallelized code demonstrates high efficiency of the software hybrid. |
Andrzejewski, Witold; Boinski, Pawel Trajectory Co-location Patterns Technical Report Poznan University of Technology Piotrowo 2, 60-965 Poznan, Poland, (RA-5/17), 2017. @techreport{AB2017continuous, title = {Trajectory Co-location Patterns}, author = {Witold Andrzejewski and Pawel Boinski}, year = {2017}, date = {2017-07-15}, number = {RA-5/17}, address = {Piotrowo 2, 60-965 Poznan, Poland}, institution = {Poznan University of Technology}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } |
2016 |
Andrzejewski, Witold; Boiński, Paweł Technical Report RA 13/16: An Algorithm for Mining Maximal Spatiotemporal Co-location Patterns Technical Report Poznan University of Technology Piotrowo 2, 60-965 Poznan, Poland, (RA 13/16), 2016. @techreport{AB2016tr, title = {Technical Report RA 13/16: An Algorithm for Mining Maximal Spatiotemporal Co-location Patterns}, author = {Witold Andrzejewski and Paweł Boiński}, year = {2016}, date = {2016-12-01}, number = {RA 13/16}, address = {Piotrowo 2, 60-965 Poznan, Poland}, institution = {Poznan University of Technology}, abstract = {Abstract. Computation of spatiotemporal co-location patterns (MDCOPs) is very costly. In the original apriori-based approach it involves finding spatial co-location patterns in each time moment and removing non-frequent ones. Minor optimizations are possible based on candidate frequency estimation and elimination of unfrequently prevalent candidates early in the apriori process. Apriori approach however can be very slow for dense datasets with large co-locations. To solve this problem we propose an algorithm for mining maximal spatiotemporal co-location patterns which is not based on apriori. Performance of the new algorithm is tested in experiments and improvements over the state-of-the-art algorithm are shown.}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } Abstract. Computation of spatiotemporal co-location patterns (MDCOPs) is very costly. In the original apriori-based approach it involves finding spatial co-location patterns in each time moment and removing non-frequent ones. Minor optimizations are possible based on candidate frequency estimation and elimination of unfrequently prevalent candidates early in the apriori process. Apriori approach however can be very slow for dense datasets with large co-locations. To solve this problem we propose an algorithm for mining maximal spatiotemporal co-location patterns which is not based on apriori. 
Performance of the new algorithm is tested in experiments and improvements over the state-of-the-art algorithm are shown. |
Andrzejewski, Witold; Bębel, Bartosz; Kłosowski, Szymon; Łukaszewski, Bartosz; Wrembel, Robert; Bakkalian, Gastón Searching for Patterns in Sequential Data: Functionality and Performance Assessment of Commercial and Open-Source Systems Conference Advances in Conceptual Modeling, ER 2016, 9975, Lecture Notes in Computer Science International Conference on Conceptual Modeling Springer International Publishing, 2016, ISBN: 978-3-319-47716-9. @conference{ABKLWB16, title = {Searching for Patterns in Sequential Data: Functionality and Performance Assessment of Commercial and Open-Source Systems}, author = {Witold Andrzejewski and Bartosz Bębel and Szymon Kłosowski and Bartosz Łukaszewski and Robert Wrembel and Gastón Bakkalian}, editor = {Sebastian Link and Juan C. Trujillo}, url = {https://link.springer.com/chapter/10.1007/978-3-319-47717-6_8}, doi = {10.1007/978-3-319-47717-6_8}, isbn = {978-3-319-47716-9}, year = {2016}, date = {2016-10-14}, booktitle = {Advances in Conceptual Modeling, ER 2016}, volume = {9975}, pages = {91--101}, publisher = {Springer International Publishing}, organization = {International Conference on Conceptual Modeling}, series = {Lecture Notes in Computer Science}, abstract = {Ubiquitous devices and applications generate data that are naturally ordered by time. Thus elementary data items can form sequences. The most popular way of analyzing sequences is searching for patterns. To this end, sequential pattern discovery techniques were proposed in some research contributions and implemented in a few database systems, e.g., Oracle Database, Teradata Aster, Apache Hive. The goal of this work is to assess the functionality of the systems and to evaluate their performance with respect to pattern queries.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Ubiquitous devices and applications generate data that are naturally ordered by time. Thus elementary data items can form sequences. The most popular way of analyzing sequences is searching for patterns. 
To this end, sequential pattern discovery techniques were proposed in some research contributions and implemented in a few database systems, e.g., Oracle Database, Teradata Aster, Apache Hive. The goal of this work is to assess the functionality of the systems and to evaluate their performance with respect to pattern queries. |
Andrzejewski, Witold; Khouri, Selma Selected Topics on Systems Modeling and Natural Language Processing: Editorial Introduction to the Issue 7 of CSIMQ Journal Article Complex Systems Informatics and Modeling Quarterly, (7), pp. I-II, 2016, ISSN: 2255-9922. @article{AK16, title = {Selected Topics on Systems Modeling and Natural Language Processing: Editorial Introduction to the Issue 7 of CSIMQ}, author = {Witold Andrzejewski and Selma Khouri}, editor = {Witold Andrzejewski and Selma Khouri}, url = {https://csimq-journals.rtu.lv/article/download/csimq.2016-7.00/821}, doi = {10.7250/csimq.2016-7.00}, issn = {2255-9922}, year = {2016}, date = {2016-07-29}, journal = {Complex Systems Informatics and Modeling Quarterly}, number = {7}, pages = {I-II}, abstract = {The seventh issue of Complex Systems Informatics and Modeling Quarterly presents five papers devoted to two distinct research topics: systems modeling and natural language processing (NLP). Both of these subjects are very important in computer science. Through modeling we can simplify the studied problem by concentrating on only one aspect at a time. Moreover, a properly constructed model allows the modeler to work on higher levels of abstraction and not having to concentrate on details. Since the size and complexity of information systems grows rapidly, creating good models of such systems is crucial. The analysis of natural language is slowly becoming a widely used tool in commerce and day to day life. Opinion mining allows recommender systems to provide accurate recommendations based on user-generated reviews. Speech recognition and NLP are the basis for such widely used personal assistants as Apple’s Siri, Microsoft’s Cortana, and Google Now. While a lot of work has already been done on natural language processing, the research usually concerns widely used languages, such as English. 
Consequently, natural language processing in languages other than English is very relevant subject and is addressed in this issue.}, keywords = {}, pubstate = {published}, tppubtype = {article} } The seventh issue of Complex Systems Informatics and Modeling Quarterly presents five papers devoted to two distinct research topics: systems modeling and natural language processing (NLP). Both of these subjects are very important in computer science. Through modeling we can simplify the studied problem by concentrating on only one aspect at a time. Moreover, a properly constructed model allows the modeler to work on higher levels of abstraction and not having to concentrate on details. Since the size and complexity of information systems grows rapidly, creating good models of such systems is crucial. The analysis of natural language is slowly becoming a widely used tool in commerce and day to day life. Opinion mining allows recommender systems to provide accurate recommendations based on user-generated reviews. Speech recognition and NLP are the basis for such widely used personal assistants as Apple’s Siri, Microsoft’s Cortana, and Google Now. While a lot of work has already been done on natural language processing, the research usually concerns widely used languages, such as English. Consequently, natural language processing in languages other than English is very relevant subject and is addressed in this issue. |
2015 |
Andrzejewski, Witold An index for non contiguous pattern template queries. Technical Report RA 4/15 Technical Report Poznan University of Technology (RA 4/15), 2015. @techreport{A2015, title = {An index for non contiguous pattern template queries. Technical Report RA 4/15}, author = {Witold Andrzejewski}, year = {2015}, date = {2015-12-07}, number = {RA 4/15}, institution = {Poznan University of Technology}, abstract = {This paper tackles the problem of efficient retrieval of sequences containing non contiguous pattern template instances. It defines the non contiguous pattern template query problem and proposes several index structures which can support such queries. It introduces and formally proves multiple lemmas and properties of the proposed index structure and provides an efficient algorithm for its construction. Also, results of several preliminary experiments testing its time of construction and properties of the index structure are presented. }, keywords = {}, pubstate = {published}, tppubtype = {techreport} } This paper tackles the problem of efficient retrieval of sequences containing non contiguous pattern template instances. It defines the non contiguous pattern template query problem and proposes several index structures which can support such queries. It introduces and formally proves multiple lemmas and properties of the proposed index structure and provides an efficient algorithm for its construction. Also, results of several preliminary experiments testing its time of construction and properties of the index structure are presented. |
Andrzejewski, Witold; Boinski, Pawel Parallel GPU-based Plane-sweep Algorithm for Construction of iCPI-trees Journal Article Journal of Database Management, 26 (3), pp. 1-20, 2015, ISSN: 1063-8016. @article{AB2016, title = {Parallel GPU-based Plane-sweep Algorithm for Construction of iCPI-trees}, author = {Witold Andrzejewski and Pawel Boinski}, editor = {Keng Siau}, url = {http://www.igi-global.com/article/parallel-gpu-based-plane-sweep-algorithm-for-construction-of-icpi-trees/145868}, doi = {10.4018/JDM.2015070101}, issn = {1063-8016}, year = {2015}, date = {2015-07-01}, journal = {Journal of Database Management}, volume = {26}, number = {3}, pages = {1-20}, abstract = {This article tackles the problem of efficient construction of iCPI trees, frequently used in co-location pattern discovery in spatial databases. It discusses the methods for parallelization of iCPI-tree construction and plane-sweep algorithms used in state-of-the-art algorithms for co-location pattern mining. The main contribution of this paper is threefold: (1) a general algorithm for parallel iCPI-tree construction is presented, (2) two variants of parallel plane-sweep algorithm (which can be used in conjunction with the aforementioned iCPI-tree construction algorithm) are introduced and (3) all three algorithms are implemented on CUDA GPU platform and their performance is tested against an efficient multithreaded parallel implementation of iCPI-tree construction on CPU. Experiments prove that our solutions allow for large speedups over CPU version of the algorithm. This paper is an extension of the conference paper (Andrzejewski & Boinski, 2014).}, keywords = {}, pubstate = {published}, tppubtype = {article} } This article tackles the problem of efficient construction of iCPI trees, frequently used in co-location pattern discovery in spatial databases. 
It discusses the methods for parallelization of iCPI-tree construction and plane-sweep algorithms used in state-of-the-art algorithms for co-location pattern mining. The main contribution of this paper is threefold: (1) a general algorithm for parallel iCPI-tree construction is presented, (2) two variants of parallel plane-sweep algorithm (which can be used in conjunction with the aforementioned iCPI-tree construction algorithm) are introduced and (3) all three algorithms are implemented on CUDA GPU platform and their performance is tested against an efficient multithreaded parallel implementation of iCPI-tree construction on CPU. Experiments prove that our solutions allow for large speedups over CPU version of the algorithm. This paper is an extension of the conference paper (Andrzejewski & Boinski, 2014). |
2014 |
Andrzejewski, Witold Indeks dla hierarchicznych zapytań o wzorce. Raport Techniczny RB-8/14 (in polish) Technical Report Poznan University of Technology 2014. @techreport{A2014, title = {Indeks dla hierarchicznych zapytań o wzorce. Raport Techniczny RB-8/14 (in polish)}, author = {Witold Andrzejewski}, year = {2014}, date = {2014-12-01}, institution = {Poznan University of Technology}, abstract = {Analiza danych sekwencyjnych w metodami OLAP (tzw. SOLAP) wymaga wsparcia ze strony systemu zarządzania bazą danych w celu zapewnienia wydajnej realizacji nowych typów zapytań analitycznych. Integralną częścią przetwarzania zapytań SOLAP jest odnajdowanie sekwencji, które są zgodne z zadanym wzorcem na zadanym poziomie agregacji. W niniejszej pracy przedstawiono kilka wariantów indeksu wspierającego ten typ zapytań. }, keywords = {}, pubstate = {published}, tppubtype = {techreport} } Analiza danych sekwencyjnych w metodami OLAP (tzw. SOLAP) wymaga wsparcia ze strony systemu zarządzania bazą danych w celu zapewnienia wydajnej realizacji nowych typów zapytań analitycznych. Integralną częścią przetwarzania zapytań SOLAP jest odnajdowanie sekwencji, które są zgodne z zadanym wzorcem na zadanym poziomie agregacji. W niniejszej pracy przedstawiono kilka wariantów indeksu wspierającego ten typ zapytań. |
Andrzejewski, Witold; Boinski, Pawel ICPI Tree construction parallelization on Graphics Processing Units (Technical Report RA-012/2014) Technical Report Poznan University of Technology 2014. @techreport{AB2014b, title = {ICPI Tree construction parallelization on Graphics Processing Units (Technical Report RA-012/2014)}, author = {Witold Andrzejewski and Pawel Boinski}, year = {2014}, date = {2014-12-01}, institution = {Poznan University of Technology}, abstract = {Collocation pattern discovery in spatial databases is today one of the most promising and interesting applications of data mining techniques. Given a set of spatial objects, collocation pattern discovery algorithms search for sets the so-called collocation patterns, i.e. sets of types of objects which are frequently located together. Mining large spatial datasets requires a lot of computing power. To solve these problems, a GPU accelerated version of the collocation pattern mining algorithm has been proposed recently [3]. This solution however, requires a prebuilt supporting structure (the so-called iCPI-tree) storing information about neighborhoods. In this paper we present three iCPI-tree generation algorithms which complement the algorithm from [3]. Performance of our solutions is compared to a parallel implementation of iCPI-tree generation method for CPU. Experiments prove that our solutions allow for large speedups over CPU version of the algorithm.}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } Collocation pattern discovery in spatial databases is today one of the most promising and interesting applications of data mining techniques. Given a set of spatial objects, collocation pattern discovery algorithms search for sets the so-called collocation patterns, i.e. sets of types of objects which are frequently located together. Mining large spatial datasets requires a lot of computing power. 
To solve these problems, a GPU accelerated version of the collocation pattern mining algorithm has been proposed recently [3]. This solution however, requires a prebuilt supporting structure (the so-called iCPI-tree) storing information about neighborhoods. In this paper we present three iCPI-tree generation algorithms which complement the algorithm from [3]. Performance of our solutions is compared to a parallel implementation of iCPI-tree generation method for CPU. Experiments prove that our solutions allow for large speedups over CPU version of the algorithm. |
Andrzejewski, Witold; Boinski, Pawel A Parallel Algorithm for Building iCPI-trees Inproceedings Manolopoulos, Yannis; Trajcevski, Goce; Kon-Popovska, Margita (Ed.): Advances in Databases and Information Systems, pp. 276–289, Springer International Publishing, 2014, ISBN: 978-3-319-10932-9. @inproceedings{AB2014a, title = {A Parallel Algorithm for Building iCPI-trees}, author = {Witold Andrzejewski and Pawel Boinski}, editor = {Yannis Manolopoulos and Goce Trajcevski and Margita Kon-Popovska}, isbn = {978-3-319-10932-9}, year = {2014}, date = {2014-09-01}, booktitle = {Advances in Databases and Information Systems}, volume = {8716}, pages = {276--289}, publisher = {Springer International Publishing}, series = {Lecture Notes in Computer Science}, abstract = {In spatial databases collocation pattern discovery is one of the most interesting fields of data mining. It consists in searching for types of spatial objects that are frequently located together in a spatial neighborhood. With the advent of data gathering techniques, huge volumes of spatial data are being collected. To cope with processing of such datasets a GPU accelerated version of the collocation pattern mining algorithm has been proposed recently [3]. However, the method assumes that a supporting structure that contains information about neighborhoods (called iCPI-tree) is given in advance. In this paper we present a GPU-based version of iCPI-tree generation algorithm for the collocation pattern discovery problem. In an experimental evaluation we compare our GPU implementation with a parallel implementation of iCPI-tree generation method for CPU. Collected results show that proposed solution is multiple times faster than the CPU version of the algorithm.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } In spatial databases collocation pattern discovery is one of the most interesting fields of data mining. 
It consists in searching for types of spatial objects that are frequently located together in a spatial neighborhood. With the advent of data gathering techniques, huge volumes of spatial data are being collected. To cope with processing of such datasets a GPU accelerated version of the collocation pattern mining algorithm has been proposed recently [3]. However, the method assumes that a supporting structure that contains information about neighborhoods (called iCPI-tree) is given in advance. In this paper we present a GPU-based version of iCPI-tree generation algorithm for the collocation pattern discovery problem. In an experimental evaluation we compare our GPU implementation with a parallel implementation of iCPI-tree generation method for CPU. Collected results show that proposed solution is multiple times faster than the CPU version of the algorithm. |
2013 |
Andrzejewski, Witold; Gramacki, Artur; Gramacki, Jarosław Graphics processing units in acceleration of bandwidth selection for kernel density estimation Journal Article International Journal of Applied Mathematics and Computer Science, 23 (4), pp. 869–886, 2013, ISSN: 1641-876X (print), 2083-8492 (online). @article{AGG2013, title = {Graphics processing units in acceleration of bandwidth selection for kernel density estimation}, author = {Witold Andrzejewski and Artur Gramacki and Jarosław Gramacki}, url = {https://amcs.uz.zgora.pl/?action=paper&paper=731}, issn = {1641-876X (print), 2083-8492 (online)}, year = {2013}, date = {2013-12-20}, journal = {International Journal of Applied Mathematics and Computer Science}, volume = {23}, number = {4}, pages = {869--886}, abstract = {The Probability Density Function (PDF) is a key concept in statistics. Constructing the most adequate PDF from the observed data is still an important and interesting scientific problem, especially for large datasets. PDFs are often estimated using nonparametric data-driven methods. One of the most popular nonparametric method is the Kernel Density Estimator (KDE). However, a very serious drawback of using KDEs is the large number of calculations required to compute them, especially to find the optimal bandwidth parameter. In this paper we investigate the possibility of utilizing Graphics Processing Units (GPUs) to accelerate the finding of the bandwidth. The contribution of this paper is threefold: (a) we propose algorithmic optimization to one of bandwidth finding algorithms, (b) we propose efficient GPU versions of three bandwidth finding algorithms and (c) we experimentally compare three of our GPU implementations with the ones which utilize only CPUs. Our experiments show orders of magnitude improvements over CPU implementations of classical algorithms.}, keywords = {}, pubstate = {published}, tppubtype = {article} } The Probability Density Function (PDF) is a key concept in statistics. 
Constructing the most adequate PDF from the observed data is still an important and interesting scientific problem, especially for large datasets. PDFs are often estimated using nonparametric data-driven methods. One of the most popular nonparametric method is the Kernel Density Estimator (KDE). However, a very serious drawback of using KDEs is the large number of calculations required to compute them, especially to find the optimal bandwidth parameter. In this paper we investigate the possibility of utilizing Graphics Processing Units (GPUs) to accelerate the finding of the bandwidth. The contribution of this paper is threefold: (a) we propose algorithmic optimization to one of bandwidth finding algorithms, (b) we propose efficient GPU versions of three bandwidth finding algorithms and (c) we experimentally compare three of our GPU implementations with the ones which utilize only CPUs. Our experiments show orders of magnitude improvements over CPU implementations of classical algorithms. |
Andrzejewski, Witold; Boiński, Paweł Collocation Pattern Mining on GPUs Technical Report Poznan University of Technology Piotrowo 2, 60-965 Poznan, Poland, (RA-16/2013), 2013. @techreport{AB2013b, title = {Collocation Pattern Mining on GPUs}, author = {Witold Andrzejewski and Paweł Boiński}, year = {2013}, date = {2013-12-07}, number = {RA-16/2013}, address = {Piotrowo 2, 60-965 Poznan, Poland}, institution = {Poznan University of Technology}, abstract = {Abstract Collocation Pattern Discovery is field of data mining performed in spatial databases. It consists in searching for types of spatial objects that are frequently located together in a spatial neighborhood. Such patterns are useful in many application domains including, but not limited to, biology, geography, marketing and meteorology. To cope with processing of these huge volumes of data, programmable high-performance hardware is needed. For this purpose we propose to utilize graphics processing units (GPUs) of modern graphics cards. GPUs have been proven recently to be extremely efficient in accelerating many existing algorithms. In this paper we present, a new GPU-accelerated version of iCPI-tree based algorithm for the collocation discovery problem. The presented algorithm was carefully designed to be able to work in limited memory environments and still retain high performance. 
Our contribution in this paper is threefold: (1) we present a new GPU based algorithm which has a much smaller memory footprint than previous solutions, (2) we propose algorithms for seamless division of work into a set of tasks such that each of them does not exceed the amount of available GPU memory and can be performed in parallel on multiple GPUs, and (3) we have performed multiple, extensive experiments on large datasets (reaching hundreds of thousands of objects) and achieved an order of magnitude speedups in comparison to an efficient multithreaded CPU implementation.}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } Abstract Collocation Pattern Discovery is field of data mining performed in spatial databases. It consists in searching for types of spatial objects that are frequently located together in a spatial neighborhood. Such patterns are useful in many application domains including, but not limited to, biology, geography, marketing and meteorology. To cope with processing of these huge volumes of data, programmable high-performance hardware is needed. For this purpose we propose to utilize graphics processing units (GPUs) of modern graphics cards. GPUs have been proven recently to be extremely efficient in accelerating many existing algorithms. In this paper we present, a new GPU-accelerated version of iCPI-tree based algorithm for the collocation discovery problem. The presented algorithm was carefully designed to be able to work in limited memory environments and still retain high performance. 
Our contribution in this paper is threefold: (1) we present a new GPU based algorithm which has a much smaller memory footprint than previous solutions, (2) we propose algorithms for seamless division of work into a set of tasks such that each of them does not exceed the amount of available GPU memory and can be performed in parallel on multiple GPUs, and (3) we have performed multiple, extensive experiments on large datasets (reaching hundreds of thousands of objects) and achieved an order of magnitude speedups in comparison to an efficient multithreaded CPU implementation. |
Andrzejewski, Witold; Boiński, Paweł GPU-Accelerated Collocation Pattern Discovery Inproceedings Proceedings of Advances in Databases and Information Systems, pp. 302–315, Springer-Verlag, 2013. @inproceedings{AB2013, title = {GPU-Accelerated Collocation Pattern Discovery}, author = {Witold Andrzejewski and Paweł Boiński}, url = {http://link.springer.com/chapter/10.1007%2F978-3-642-40683-6_23}, year = {2013}, date = {2013-09-01}, booktitle = {Proceedings of Advances in Databases and Information Systems}, journal = {Lecture Notes in Computer Science }, volume = {8133}, pages = {302--315}, publisher = {Springer-Verlag}, series = {Lecture Notes in Computer Science}, abstract = {Collocation Pattern Discovery is a very interesting field of data mining in spatial databases. It consists in searching for types of spatial objects that are frequently located together in a spatial neighborhood. Application domains of such patterns include, but are not limited to, biology, geography, marketing and meteorology. To cope with processing of these huge volumes of data programmable high-performance graphic cards (GPU) can be used. GPUs have been proven recently to be extremely efficient in accelerating many existing algorithms. In this paper we present GPU-CM, a GPU-accelerated version of iCPI-tree based algorithm for the collocation discovery problem. To achieve the best performance we introduce specially designed structures and processing methods for the best utilization of the SIMD execution model. In experimental evaluation we compare our GPU implementation with a parallel implementation of iCPI-tree method for CPU. Collected results show order of magnitude speedups over the CPU version of the algorithm.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Collocation Pattern Discovery is a very interesting field of data mining in spatial databases. 
It consists in searching for types of spatial objects that are frequently located together in a spatial neighborhood. Application domains of such patterns include, but are not limited to, biology, geography, marketing and meteorology. To cope with processing of these huge volumes of data programmable high-performance graphic cards (GPU) can be used. GPUs have been proven recently to be extremely efficient in accelerating many existing algorithms. In this paper we present GPU-CM, a GPU-accelerated version of iCPI-tree based algorithm for the collocation discovery problem. To achieve the best performance we introduce specially designed structures and processing methods for the best utilization of the SIMD execution model. In experimental evaluation we compare our GPU implementation with a parallel implementation of iCPI-tree method for CPU. Collected results show order of magnitude speedups over the CPU version of the algorithm. |
Andrzejewski, Witold; Gramacki, Artur; Gramacki, Jarosław Density Estimations for Approximate Query Processing on SIMD Architectures Technical Report Poznan University Of Technology Piotrowo 2, 60-965 Poznan, Poland, (RA 03/13), 2013. @techreport{AGG2013a, title = {Density Estimations for Approximate Query Processing on SIMD Architectures}, author = {Witold Andrzejewski and Artur Gramacki and Jarosław Gramacki}, year = {2013}, date = {2013-07-01}, number = {RA 03/13}, address = {Piotrowo 2, 60-965 Poznan, Poland}, institution = {Poznan University Of Technology}, abstract = {Approximate query processing (AQP) is an interesting alternative for exact query processing. It is a tool for dealing with the huge data volumes where response time is more important than perfect accuracy (this is typically the case during initial phase of data exploration). There are many techniques for AQP, one of them is based on probability density functions (PDF). PDFs are typically calculated using nonparametric data-driven methods. One of the most popular nonparametric method is the kernel density estimator (KDE). However, a very serious drawback of using KDEs is the large number of calculations required to compute them. The shape of final density function is very sensitive to an entity called bandwidth or smoothing parameter. Calculating it’s optimal value is not a trivial task and in general is very time consuming. In this paper we investigate the possibility of utilizing two SIMD architectures: SSE CPU extensions and NVIDIA’s CUDA architecture to accelerate finding of the bandwidth. Our experiments show orders of magnitude improvements over a simple sequential implementation of classical algorithms used for that task. }, keywords = {}, pubstate = {published}, tppubtype = {techreport} } Approximate query processing (AQP) is an interesting alternative for exact query processing. 
It is a tool for dealing with the huge data volumes where response time is more important than perfect accuracy (this is typically the case during initial phase of data exploration). There are many techniques for AQP, one of them is based on probability density functions (PDF). PDFs are typically calculated using nonparametric data-driven methods. One of the most popular nonparametric methods is the kernel density estimator (KDE). However, a very serious drawback of using KDEs is the large number of calculations required to compute them. The shape of final density function is very sensitive to an entity called bandwidth or smoothing parameter. Calculating its optimal value is not a trivial task and in general is very time consuming. In this paper we investigate the possibility of utilizing two SIMD architectures: SSE CPU extensions and NVIDIA’s CUDA architecture to accelerate finding of the bandwidth. Our experiments show orders of magnitude improvements over a simple sequential implementation of classical algorithms used for that task. |
2012 |
Andrzejewski, Witold; Bębel, Bartosz FOCUS: An Index FOr ContinuoUS Subsequence Pattern Queries Inproceedings Morzy, Tadeusz; Haerder, Theo; Wrembel, Robert (Ed.): Proceedings of the 16th East European Conference Advances in Databases and Information Systems ADBIS 2012, Poznań, Poland, pp. 29–42, Springer-Verlag, 2012, ISBN: 0302-9743. @inproceedings{AB2012a, title = {FOCUS: An Index FOr ContinuoUS Subsequence Pattern Queries}, author = {Witold Andrzejewski and Bartosz Bębel}, editor = {Tadeusz Morzy and Theo Haerder and Robert Wrembel}, url = {http://link.springer.com/chapter/10.1007%2F978-3-642-33074-2_3}, isbn = {0302-9743}, year = {2012}, date = {2012-09-18}, booktitle = {Proceedings of the 16th East European Conference Advances in Databases and Information Systems ADBIS 2012, Poznań, Poland}, journal = {Lecture Notes in Computer Science}, number = {7503}, pages = {29--42}, publisher = {Springer-Verlag}, series = {Lecture Notes in Computer Science}, abstract = {Recent appearance of the a type of OLAP analysis, the sequential OLAP (or SOLAP) has caused the need for new index structures which support new types of analytical queries. An integral part of processing SOLAP queries is finding sequences which match a user-specified pattern. We call such queries \"subsequence pattern queries\". The contribution of this paper is threefold: first, we propose logical and physical index structure which supports subsequence pattern queries, second, we extend this structure to support aggregation queries and third, we perform performance experiments which show that our solutions offer orders of magnitude improvement over previous state of the art solutions.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Recent appearance of the a type of OLAP analysis, the sequential OLAP (or SOLAP) has caused the need for new index structures which support new types of analytical queries. 
An integral part of processing SOLAP queries is finding sequences which match a user-specified pattern. We call such queries "subsequence pattern queries". The contribution of this paper is threefold: first, we propose logical and physical index structure which supports subsequence pattern queries, second, we extend this structure to support aggregation queries and third, we perform performance experiments which show that our solutions offer orders of magnitude improvement over previous state of the art solutions. |
Andrzejewski, Witold; Bębel, Bartosz FOCUS: An Index FOr ContinuoUS Subsequence Pattern Queries Technical Report Poznan University Of Technology Piotrowo 2, 60-965 Poznan, Poland, (RA 04/2012), 2012. @techreport{AB2012b, title = {FOCUS: An Index FOr ContinuoUS Subsequence Pattern Queries}, author = {Witold Andrzejewski and Bartosz Bębel}, year = {2012}, date = {2012-06-01}, number = {RA 04/2012}, address = {Piotrowo 2, 60-965 Poznan, Poland}, institution = {Poznan University Of Technology}, abstract = {Recent appearance of the a type of OLAP analysis, the sequential OLAP (or SOLAP) has caused the need for new index structures which support new types of analytical queries. An integral part of processing SOLAP queries is finding sequences which match a user-specified pattern. We call such queries \\emph{subsequence pattern queries}. The contribution of this paper is threefold: first, we propose logical and physical index structure which supports subsequence pattern queries, second, we extend this structure to support aggregation queries and third, we perform performance experiments which show that our solutions offer orders of magnitude improvement over previous state of the art solutions.}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } Recent appearance of the a type of OLAP analysis, the sequential OLAP (or SOLAP) has caused the need for new index structures which support new types of analytical queries. An integral part of processing SOLAP queries is finding sequences which match a user-specified pattern. We call such queries \emph{subsequence pattern queries}. The contribution of this paper is threefold: first, we propose logical and physical index structure which supports subsequence pattern queries, second, we extend this structure to support aggregation queries and third, we perform performance experiments which show that our solutions offer orders of magnitude improvement over previous state of the art solutions. |
2011 |
Andrzejewski, Witold (in polish) Indeks dla zapytań o ciągłe wystąpienia wzorców Inproceedings Materiały konferencyjne PLOUG 2011, 2011. @inproceedings{A2011, title = {(in polish) Indeks dla zapytań o ciągłe wystąpienia wzorców}, author = {Witold Andrzejewski}, url = {http://www.ploug.org.pl/konf_11/materialy/pdf/01_andrzejewski_indeks.pdf}, year = {2011}, date = {2011-10-01}, booktitle = {Materiały konferencyjne PLOUG 2011}, abstract = {Opracowany w ostatnich latach nowy typ analitycznego przetwarzania danych, rozszerzający standardowy OLAP na dane sekwencyjne, tzw. Sequential OLAP (SOLAP), wprowadził nowe nowe sposoby przetwarzania i odpytywania danych. Integralną częścią przetwarzania SOLAP jest odszukiwanie sekwencji, w których występuje zdefiniowany przez użytkownika wzorzec. Ze względu na konieczność nietypowego przetwarzania oraz przeszukiwania danych z uwzględnieniem zdefiniowanego na nich porządku, standardowe struktury indeksowe okazują się być niewystarczające dla wydajnej realizacji zapytań SOLAP. W niniejszym artykule przedstawiono strukturę indeksową wspierającą zapytania o sekwencje zawierające zadane wzorce, jej implementację w środowisku Oracle Database oraz wyniki eksperymentalne potwierdzające znacznie zwiększoną wydajność realizacji zapytań o wystąpienia wzorców w stosunku do standardowych metod realizacji tego typu zapytań.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Opracowany w ostatnich latach nowy typ analitycznego przetwarzania danych, rozszerzający standardowy OLAP na dane sekwencyjne, tzw. Sequential OLAP (SOLAP), wprowadził nowe nowe sposoby przetwarzania i odpytywania danych. Integralną częścią przetwarzania SOLAP jest odszukiwanie sekwencji, w których występuje zdefiniowany przez użytkownika wzorzec. 
Ze względu na konieczność nietypowego przetwarzania oraz przeszukiwania danych z uwzględnieniem zdefiniowanego na nich porządku, standardowe struktury indeksowe okazują się być niewystarczające dla wydajnej realizacji zapytań SOLAP. W niniejszym artykule przedstawiono strukturę indeksową wspierającą zapytania o sekwencje zawierające zadane wzorce, jej implementację w środowisku Oracle Database oraz wyniki eksperymentalne potwierdzające znacznie zwiększoną wydajność realizacji zapytań o wystąpienia wzorców w stosunku do standardowych metod realizacji tego typu zapytań. |
Andrzejewski, Witold; Wrembel, Robert GPU-PLWAH: GPU-based implementation of the PLWAH algorithm for compressing bitmaps Journal Article Control and Cybernetics, 40 (3), pp. 627–650, 2011. @article{AW2011, title = {GPU-PLWAH: GPU-based implementation of the PLWAH algorithm for compressing bitmaps}, author = {Witold Andrzejewski and Robert Wrembel}, url = {http://control.ibspan.waw.pl:3000/contents/export?filename=2011-3-03_Andrzejewski_Wrembel.pdf}, year = {2011}, date = {2011-07-01}, journal = {Control and Cybernetics}, volume = {40}, number = {3}, pages = {627--650}, abstract = {Bitmap indexes are data structures applied to indexing attributes in databases and data warehouses. A drawback of a bitmap index is that its size increases when the domain of an indexed attribute increases. As a consequence, for wide domains, the size of a bitmap index is too large to be efficiently processed. Hence, various techniques of compressing bitmap indexes have been proposed. A compression technique incurs some system overhead (mainly CPU) for compression and decompression operations. For this reason, we propose to use additional processing power of graphical processing units (GPUs). In this paper, we present the GPU-PLWAH algorithm that is a parallel implementation of the recently developed PLWAH compression algorithm. GPU-PLWAH was experimentally compared to its traditional CPU version as well as to our previously developed parallel GPU implementation of the WAH compression algorithm. The experiments show that applying GPUs significantly reduces compression/decompression time.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Bitmap indexes are data structures applied to indexing attributes in databases and data warehouses. A drawback of a bitmap index is that its size increases when the domain of an indexed attribute increases. As a consequence, for wide domains, the size of a bitmap index is too large to be efficiently processed. 
Hence, various techniques of compressing bitmap indexes have been proposed. A compression technique incurs some system overhead (mainly CPU) for compression and decompression operations. For this reason, we propose to use additional processing power of graphical processing units (GPUs). In this paper, we present the GPU-PLWAH algorithm that is a parallel implementation of the recently developed PLWAH compression algorithm. GPU-PLWAH was experimentally compared to its traditional CPU version as well as to our previously developed parallel GPU implementation of the WAH compression algorithm. The experiments show that applying GPUs significantly reduces compression/decompression time. |
2010 |
Andrzejewski, Witold; Gramacki, Artur; Gramacki, Jarosław (in polish) Przybliżone zapytania do baz danych z akceleracją obliczeń rozkładów prawdopodobieństwa Inproceedings Materiały konferencyjne PLOUG, 2010. @inproceedings{AGG2010, title = {(in polish) Przybliżone zapytania do baz danych z akceleracją obliczeń rozkładów prawdopodobieństwa}, author = {Witold Andrzejewski and Artur Gramacki and Jarosław Gramacki }, url = {http://www.ploug.org.pl/konf_10/materialy/pdf/17.pdf}, year = {2010}, date = {2010-09-01}, booktitle = {Materiały konferencyjne PLOUG}, abstract = {Artykuł pokazuje przykładowe zastosowanie architektury CUDA opracowanej przez firmę NVIDIA dla swoich kart graficznych. CUDA to uniwersalna architektura procesorów wielordzeniowych instalowanych we współczesnych, najbardziej wydajnych, kartach graficznych. Karta taka, oprócz oczywistych zastosowań w dziedzinie ogólnie pojętego przetwarzania obrazu, może być z powodzeniem wykorzystywana do wykonywania złożonych obliczeń numerycznych, zwłaszcza takich, które poddają się operacji zrównoleglenia (można wówczas efektywnie wykorzystywać moc zainstalowanych na karcie graficznej tzw. multiprocesorów strumieniowych). Jako przykład bardzo czasochłonnych obliczeń wybrano procedury wyznaczania tzw. parametrów wygładzania estymatorów jądrowych służących do wyznaczania rozkładów prawdopodobieństwa danych. Znajomość takich rozkładów pozwala na ekstremalnie szybkie wyznaczanie przybliżonych wyników zapytań agregujących. }, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Artykuł pokazuje przykładowe zastosowanie architektury CUDA opracowanej przez firmę NVIDIA dla swoich kart graficznych. CUDA to uniwersalna architektura procesorów wielordzeniowych instalowanych we współczesnych, najbardziej wydajnych, kartach graficznych. 
Karta taka, oprócz oczywistych zastosowań w dziedzinie ogólnie pojętego przetwarzania obrazu, może być z powodzeniem wykorzystywana do wykonywania złożonych obliczeń numerycznych, zwłaszcza takich, które poddają się operacji zrównoleglenia (można wówczas efektywnie wykorzystywać moc zainstalowanych na karcie graficznej tzw. multiprocesorów strumieniowych). Jako przykład bardzo czasochłonnych obliczeń wybrano procedury wyznaczania tzw. parametrów wygładzania estymatorów jądrowych służących do wyznaczania rozkładów prawdopodobieństwa danych. Znajomość takich rozkładów pozwala na ekstremalnie szybkie wyznaczanie przybliżonych wyników zapytań agregujących. |
Andrzejewski, Witold; Wrembel, Robert GPU-WAH: Applying GPUs to Compressing Bitmap Indexes with Word Aligned Hybrid Inproceedings Proceedings of the 21st international conference on Database and expert systems applications: Part II, pp. 315–329, Springer-Verlag, Berlin, Heidelberg, 2010. @inproceedings{AR2010a, title = {GPU-WAH: Applying GPUs to Compressing Bitmap Indexes with Word Aligned Hybrid}, author = {Witold Andrzejewski and Robert Wrembel}, url = {http://link.springer.com/content/pdf/10.1007/978-3-642-15251-1_26.pdf}, year = {2010}, date = {2010-09-01}, booktitle = {Proceedings of the 21st international conference on Database and expert systems applications: Part II}, number = {6262}, pages = {315--329}, publisher = {Springer-Verlag}, address = {Berlin, Heidelberg}, series = {Lecture Notes in Computer Science}, abstract = {Bitmap indexes are one of the basic data structures applied to query optimization in data warehouses. The size of a bitmap index strongly depends on the domain of an indexed attribute, and for wide domains it is too large to be efficiently processed. For this reason, various techniques of compressing bitmap indexes have been proposed. Typically, compressed indexes have to be decompressed before being used by a query optimizer that incurs a CPU overhead and deteriorates the performance of a system. For this reason, we propose to use additional processing power of the GPUs of modern graphics cards for compressing and decompressing bitmap indexes. In this paper we present a modification of the well known WAH compression technique so that it can be executed and parallelized on modern GPUs.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Bitmap indexes are one of the basic data structures applied to query optimization in data warehouses. The size of a bitmap index strongly depends on the domain of an indexed attribute, and for wide domains it is too large to be efficiently processed. 
For this reason, various techniques of compressing bitmap indexes have been proposed. Typically, compressed indexes have to be decompressed before being used by a query optimizer that incurs a CPU overhead and deteriorates the performance of a system. For this reason, we propose to use additional processing power of the GPUs of modern graphics cards for compressing and decompressing bitmap indexes. In this paper we present a modification of the well known WAH compression technique so that it can be executed and parallelized on modern GPUs. |
Andrzejewski, Witold; Wrembel, Robert GPU-PLWAH: GPU-based Implementation of the PLWAH Algorithm for Compressing Bitmaps Inproceedings Proceedings of the KKNTPD 2010 Conference, pp. 56–70, Wydawnictwa Naukowo-Techniczne, 2010. @inproceedings{AR2010b, title = {GPU-PLWAH: GPU-based Implementation of the PLWAH Algorithm for Compressing Bitmaps}, author = {Witold Andrzejewski and Robert Wrembel}, year = {2010}, date = {2010-09-01}, booktitle = {Proceedings of the KKNTPD 2010 Conference}, pages = {56--70}, publisher = {Wydawnictwa Naukowo-Techniczne}, abstract = {Bitmap indexes are data structures applied to indexing attributes in databases and data warehouses. A drawback of a bitmap index is that its size increases when the domain of an indexed attribute increases. As a consequence, for wide domains, the size of a bitmap index is too large to be efficiently processed. Hence, various techniques of compressing bitmap indexes have been proposed. A compression technique incurs some system overhead (mainly CPU) for compression and decompression operations. For this reason, we propose to use additional processing power of graphical processing units (GPUs). In this paper, we present the GPU-PLWAH algorithm that is a parallel implementation of the recently developed PLWAH compression algorithm. GPU-PLWAH was experimentally compared to its traditional CPU version as well as to our previously developed parallel GPU implementation of the WAH compression algorithm. The experiments show that applying GPUs significantly reduces compression/decompression time.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Bitmap indexes are data structures applied to indexing attributes in databases and data warehouses. A drawback of a bitmap index is that its size increases when the domain of an indexed attribute increases. As a consequence, for wide domains, the size of a bitmap index is too large to be efficiently processed. 
Hence, various techniques of compressing bitmap indexes have been proposed. A compression technique incurs some system overhead (mainly CPU) for compression and decompression operations. For this reason, we propose to use additional processing power of graphical processing units (GPUs). In this paper, we present the GPU-PLWAH algorithm that is a parallel implementation of the recently developed PLWAH compression algorithm. GPU-PLWAH was experimentally compared to its traditional CPU version as well as to our previously developed parallel GPU implementation of the WAH compression algorithm. The experiments show that applying GPUs significantly reduces compression/decompression time. |
Andrzejewski, Witold; Kaźmierczak, Tomasz (in polish) Wydajne grupowanie obiektów metodą K-Medoids z wykorzystaniem technologii CUDA Inproceedings Proceedings of the KKNTPD 2010 Conference, 2010. @inproceedings{AK2010b, title = {(in polish) Wydajne grupowanie obiektów metodą K-Medoids z wykorzystaniem technologii CUDA}, author = {Witold Andrzejewski and Tomasz Kaźmierczak}, year = {2010}, date = {2010-09-01}, booktitle = {Proceedings of the KKNTPD 2010 Conference}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Andrzejewski, Witold An Index for Continuous Subsequence Pattern Queries Miscellaneous Technical Report, 2010. @misc{A2010, title = {An Index for Continuous Subsequence Pattern Queries}, author = {Witold Andrzejewski}, year = {2010}, date = {2010-06-01}, howpublished = {Technical Report}, keywords = {}, pubstate = {published}, tppubtype = {misc} } |
Andrzejewski, Witold; Kaźmierczak, Tomasz Fast Clustering on CUDA Platform Journal Article Foundations of Computing and Decision Sciences, 35 (4), pp. 241–260, 2010, ISSN: 0867-6356. @article{AK2010, title = {Fast Clustering on CUDA Platform}, author = {Witold Andrzejewski and Tomasz Kaźmierczak}, url = {http://fcds.cs.put.poznan.pl/FCDS2/ArticleDetails.aspx?articleId=239}, issn = {0867-6356}, year = {2010}, date = {2010-01-01}, journal = {Foundations of Computing and Decision Sciences}, volume = {35}, number = {4}, pages = {241--260}, abstract = {K-Medoids clustering is very expensive. The basic algorithm PAM (Partitioning Around Medoids) does not scale very well for bigger datasets. To cope with this problem many modifications of the PAM algorithm, have been developed (ie. CLARANS). Unfortunately larger datasets still need to be clustered on computers whose computing power exceeds those of the normal desktop PCs. In this paper we present modifications of K-Medoids clustering algorithms PAM and CLARANS (Clustering Large Applications based on RANdomized Search) which utilize the graphics processing units (GPUs) of the modern graphics cards through CUDA (Compute Unified Device Architecture) platform to accelerate the most costly stages of these algorithms. We also present results of extensive performance experiments which show high improvements over old versions of these algorithms.}, keywords = {}, pubstate = {published}, tppubtype = {article} } K-Medoids clustering is very expensive. The basic algorithm PAM (Partitioning Around Medoids) does not scale very well for bigger datasets. To cope with this problem many modifications of the PAM algorithm, have been developed (ie. CLARANS). Unfortunately larger datasets still need to be clustered on computers whose computing power exceeds those of the normal desktop PCs. 
In this paper we present modifications of K-Medoids clustering algorithms PAM and CLARANS (Clustering Large Applications based on RANdomized Search) which utilize the graphics processing units (GPUs) of the modern graphics cards through CUDA (Compute Unified Device Architecture) platform to accelerate the most costly stages of these algorithms. We also present results of extensive performance experiments which show high improvements over old versions of these algorithms. |
Gramacki, Artur; Gramacki, Jarosław; Andrzejewski, Witold Probability Density Functions for Calculating Approximate Aggregates Journal Article Foundations of Computing and Decision Sciences, 35 (4), pp. 223–240, 2010. @article{GGA2010, title = {Probability Density Functions for Calculating Approximate Aggregates}, author = {Artur Gramacki and Jarosław Gramacki and Witold Andrzejewski}, url = {http://fcds.cs.put.poznan.pl/FCDS2/ArticleDetails.aspx?articleId=238}, year = {2010}, date = {2010-01-01}, journal = {Foundations of Computing and Decision Sciences}, volume = {35}, number = {4}, pages = {223--240}, abstract = {In the paper we show how one can use probability density function (PDF) for calculating approximate aggregates. The aggregates can be obtained very quickly and efficiently and there is no need to look through the large amount of data, as well as creating a sort of materialized aggregates (usually implemented as materialized views). Although the final results are only approximate, the method is extremely fast and can be successively used during initial phase of data exploration. We include simple experimental results which proof effectiveness of the method, especially if PDFs are typical, for example similar to Gaussian normal ones. If the PDFs differ from a normal distribution, one can consider making a proper preliminary transformation of the input variables or estimate PDFs by some nonparametric methods, for example using the so called kernel estimators. The later is used in the paper. To accelerate calculations, one can consider a usage of graphics processing unit (GPU). We point out this approach in the last section of the paper and give some preliminary results which are very promising.}, keywords = {}, pubstate = {published}, tppubtype = {article} } In the paper we show how one can use probability density function (PDF) for calculating approximate aggregates. 
The aggregates can be obtained very quickly and efficiently and there is no need to look through the large amount of data, as well as creating a sort of materialized aggregates (usually implemented as materialized views). Although the final results are only approximate, the method is extremely fast and can be successively used during initial phase of data exploration. We include simple experimental results which prove the effectiveness of the method, especially if PDFs are typical, for example similar to Gaussian normal ones. If the PDFs differ from a normal distribution, one can consider making a proper preliminary transformation of the input variables or estimate PDFs by some nonparametric methods, for example using the so called kernel estimators. The latter is used in the paper. To accelerate calculations, one can consider a usage of graphics processing unit (GPU). We point out this approach in the last section of the paper and give some preliminary results which are very promising. |
Andrzejewski, Witold; Królikowski, Zbyszko; Morzy, Tadeusz (in polish) Bazy danych i systemy informatyczne oraz ich wpływ na rozwój informatyki w Polsce Book Chapter Polskie i światowe osiągnięcia nauki: Nauki techniczne, pp. 345–388, Gliwice, 2010. @inbook{AZM2010, title = {(in polish) Bazy danych i systemy informatyczne oraz ich wpływ na rozwój informatyki w Polsce}, author = {Witold Andrzejewski and Zbyszko Królikowski and Tadeusz Morzy}, url = {http://fundacjarozwojunauki.pl/res/Tom2/10_Morzy.pdf}, year = {2010}, date = {2010-01-01}, booktitle = {Polskie i światowe osiągnięcia nauki: Nauki techniczne}, pages = {345--388}, address = {Gliwice}, keywords = {}, pubstate = {published}, tppubtype = {inbook} } |
2009 |
Andrzejewski, Witold; Królikowski, Zbyszko; Morzy, Tadeusz How to improve efficiency of analysis of sequential data? Journal Article Control and Cybernetics, 38 (1), pp. 107–126, 2009. @article{AKM2009, title = {How to improve efficiency of analysis of sequential data?}, author = {Witold Andrzejewski and Zbyszko Królikowski and Tadeusz Morzy}, url = {http://matwbn.icm.edu.pl/ksiazki/cc/cc38/cc3817.pdf}, year = {2009}, date = {2009-01-01}, journal = {Control and Cybernetics}, volume = {38}, number = {1}, pages = {107--126}, abstract = {Many of todays database applications, including market basket analysis, web log analysis, DNA and protein sequence analysis utilize databases to store and retrieve sequential data. Commercial database management systems allow to store sequential data, but they do not support efficient querying of such data. To increase the efficiency of analysis of sequential data new index structures need to be developed. In this paper we propose an indexing scheme for non-timestamped sequences of sets, which supports set subsequence queries. Our contribution is threefold. First, we describe the index logical and physical structure, second, we provide algorithms for set subsequence queries utilizing this structure, and finally we perform experimental evaluation of the index, which proves its feasibility and advantages in set subsequence query processing. }, keywords = {}, pubstate = {published}, tppubtype = {article} } Many of todays database applications, including market basket analysis, web log analysis, DNA and protein sequence analysis utilize databases to store and retrieve sequential data. Commercial database management systems allow to store sequential data, but they do not support efficient querying of such data. To increase the efficiency of analysis of sequential data new index structures need to be developed. In this paper we propose an indexing scheme for non-timestamped sequences of sets, which supports set subsequence queries. 
Our contribution is threefold. First, we describe the index logical and physical structure, second, we provide algorithms for set subsequence queries utilizing this structure, and finally we perform experimental evaluation of the index, which proves its feasibility and advantages in set subsequence query processing. |
Andrzejewski, Witold; Królikowski, Zbyszko; Masewicz, Mariusz; Wrembel, Robert Predicting access to materialized methods by means of hidden Markov model Journal Article Control and Cybernetics, 38 (1), pp. 128–152, 2009. @article{AKMW2009, title = {Predicting access to materialized methods by means of hidden Markov model}, author = {Witold Andrzejewski and Zbyszko Królikowski and Mariusz Masewicz and Robert Wrembel}, url = {http://matwbn.icm.edu.pl/ksiazki/cc/cc38/cc3818.pdf}, year = {2009}, date = {2009-01-01}, journal = {Control and Cybernetics}, volume = {38}, number = {1}, pages = {128--152}, abstract = {Method materialization is a promising data access optimization technique for multiple applications, including, in particular object programming languages with persistence, object databases, distributed computing systems, object-relational data warehouses, multimedia data warehouses, and spatial data warehouses. A drawback of this technique is that the value of a materialized method becomes invalid when an object used for computing the value of the method is updated. As a consequence, a materialized value of the method has to be recomputed. The materialized value can be recomputed either immediately after updating the object or just before calling the method. The moment the method is recomputed bears a strong impact on the overall system performance. In this paper we propose a technique of predicting access to materialized methods and objects, for the purpose of selecting the most appropriate recomputation technique. The prediction technique is based on the Hidden Markov Model (HMM). The prediction technique was implemented and evaluated experimentally. 
Its performance characteristics were compared to: immediate recomputation, deferred recomputation, random recomputation, and to our previous prediction technique, called a PMAP.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Method materialization is a promising data access optimization technique for multiple applications, including, in particular object programming languages with persistence, object databases, distributed computing systems, object-relational data warehouses, multimedia data warehouses, and spatial data warehouses. A drawback of this technique is that the value of a materialized method becomes invalid when an object used for computing the value of the method is updated. As a consequence, a materialized value of the method has to be recomputed. The materialized value can be recomputed either immediately after updating the object or just before calling the method. The moment the method is recomputed bears a strong impact on the overall system performance. In this paper we propose a technique of predicting access to materialized methods and objects, for the purpose of selecting the most appropriate recomputation technique. The prediction technique is based on the Hidden Markov Model (HMM). The prediction technique was implemented and evaluated experimentally. Its performance characteristics were compared to: immediate recomputation, deferred recomputation, random recomputation, and to our previous prediction technique, called a PMAP. |
2007 |
Andrzejewski, Witold; Królikowski, Zbyszko; Morzy, Tadeusz How to Improve Efficiency of Analysis of Sequential Data? Inproceedings Proceedings of the KKNTPD 2007 conference, 2007. @inproceedings{AKM2007, title = {How to Improve Efficiency of Analysis of Sequential Data?}, author = {Witold Andrzejewski and Zbyszko Królikowski and Tadeusz Morzy}, year = {2007}, date = {2007-09-01}, booktitle = {Proceedings of the KKNTPD 2007 conference}, abstract = {In order to extract useful knowledge from large databases of sales data, data mining algorithms (the so-called market basket analysis) are used. Unfortunately, these algorithms, depending on data and parameters, may generate a large number of patterns. Analysis of these results is performed by the user and involves executing a lot of queries on complex data types that are not well supported by commercially available database management systems. To increase efficiency of analysis of data mining results, new index structures need to be developed. In this paper we propose the indexing scheme for non-timestamped sequences of sets, which supports set subsequence queries. Experimental evaluation of the index proves the feasibility and benefit of the index in query processing.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } In order to extract useful knowledge from large databases of sales data, data mining algorithms (the so-called market basket analysis) are used. Unfortunately, these algorithms, depending on data and parameters, may generate a large number of patterns. Analysis of these results is performed by the user and involves executing a lot of queries on complex data types that are not well supported by commercially available database management systems. To increase efficiency of analysis of data mining results, new index structures need to be developed. In this paper we propose the indexing scheme for non-timestamped sequences of sets, which supports set subsequence queries. 
Experimental evaluation of the index proves the feasibility and benefit of the index in query processing. |
Andrzejewski, Witold; Królikowski, Zbyszko; Masewicz, Mariusz; Wrembel, Robert (in polish) Ukryte modele Markowa jako mechanizm automatycznej predykcji żądań użytkowników w systemach z materializacją hierachiczną Inproceedings Proceedings of the KKNTPD 2007 Conference, 2007. @inproceedings{AKMW2007, title = {(in polish) Ukryte modele Markowa jako mechanizm automatycznej predykcji żądań użytkowników w systemach z materializacją hierachiczną}, author = {Witold Andrzejewski and Zbyszko Królikowski and Mariusz Masewicz and Robert Wrembel}, year = {2007}, date = {2007-09-01}, booktitle = {Proceedings of the KKNTPD 2007 Conference}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Andrzejewski, Witold Fast K-Medoids Clustering on PCs Inproceedings Morzy, Tadeusz; Morzy, Mikołaj; Nanopoulos, Alexandros (Ed.): Proceedings of the ADMKD 2007 workshop, pp. 29–44, Technical University of Varna, 2007. @inproceedings{A2007, title = {Fast K-Medoids Clustering on PCs}, author = {Witold Andrzejewski}, editor = {Tadeusz Morzy and Mikołaj Morzy and Alexandros Nanopoulos}, year = {2007}, date = {2007-09-01}, booktitle = {Proceedings of the ADMKD 2007 workshop}, pages = {29--44}, publisher = {Technical University of Varna}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
2006 |
Andrzejewski, Witold; Morzy, Tadeusz AISS: An Index for Non-timestamped Set Subsequence Queries Inproceedings Tjoa, Min A; Trujillo, Juan (Ed.): Proceedings of the 8th International Conference on DataWarehousing and Knowledge Discovery DaWaK 2006, Krakow, Poland, pp. 503–512, Springer-Verlag, 2006, ISBN: 0302-9743. @inproceedings{AM2006, title = {AISS: An Index for Non-timestamped Set Subsequence Queries}, author = {Witold Andrzejewski and Tadeusz Morzy}, editor = {A Min Tjoa and Juan Trujillo}, url = {http://link.springer.com/chapter/10.1007%2F11823728_48}, isbn = {0302-9743}, year = {2006}, date = {2006-09-04}, booktitle = {Proceedings of the 8th International Conference on DataWarehousing and Knowledge Discovery DaWaK 2006, Krakow, Poland}, number = {4081}, pages = {503--512}, publisher = {Springer-Verlag}, series = {Lecture Notes in Computer Science}, abstract = {In many recent applications of database management systems data may be stored in user defined complex data types (such as sequences). However, efficient querying of such data is not supported by commercially available database management systems and therefore efficient indexing schemes for complex data types need to be developed. In this paper we focus primarily on the indexing of non-timestamped sequences of sets of categorical data, specifically indexing for set subsequence queries. We address bot: logical structure and implementation issues of such indexes. Our main contributions are threefold. First, we specify the logical structure of the index and we propose algorithms for set subsequence query execution, which utilize the index structure. Second, we provide the proposition for the implementation of such index, which uses means available in all of the \"of the shelf\" database management systems. 
Finally, we experimentally evaluate the performance of the index.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } In many recent applications of database management systems data may be stored in user defined complex data types (such as sequences). However, efficient querying of such data is not supported by commercially available database management systems and therefore efficient indexing schemes for complex data types need to be developed. In this paper we focus primarily on the indexing of non-timestamped sequences of sets of categorical data, specifically indexing for set subsequence queries. We address both logical structure and implementation issues of such indexes. Our main contributions are threefold. First, we specify the logical structure of the index and we propose algorithms for set subsequence query execution, which utilize the index structure. Second, we provide the proposition for the implementation of such index, which uses means available in all of the "of the shelf" database management systems. Finally, we experimentally evaluate the performance of the index. |
Andrzejewski, Witold (in polish) Implementacja indeksów dla analizy wyników eksploracji danych w Oracle 10g Inproceedings Proceedings of the PLOUG 2006 Conference, pp. 311-328, 2006. @inproceedings{A2006, title = {(in polish) Implementacja indeksów dla analizy wyników eksploracji danych w Oracle 10g}, author = {Witold Andrzejewski}, url = {http://www.ploug.org.pl/konf_06/materialy/pdf/21_Andrzejewski.pdf}, year = {2006}, date = {2006-09-01}, booktitle = {Proceedings of the PLOUG 2006 Conference}, pages = {311-328}, abstract = {Olbrzymie bazy danych dotyczących sprzedaży produktów w supermarketach są zbyt duże, by można na nich przeprowa-dzić „ręczną” analizę. Odpowiedzią na ten problem jest użycie algorytmów eksploracji danych, czyli tzw. \"analizy koszyka sklepowego\", do automatycznej analizy danych w celu otrzymania użytecznej wiedzy. W wyniku takiej analizy (w zależno-ści od parametrów algorytmów) można otrzymać olbrzymią liczbę wzorców, takich jak: zbiory częste, reguły asocjacyjne albo wzorce sekwencji. Wśród tych wzorców znajduje się wiele zależności, które są przypadkowe, nieciekawe, bądź już znane. Zadaniem człowieka jest odnalezienie wśród wyników eksploracji danych wzorców ciekawych i o praktycznym zastosowaniu. Taka analiza wymaga wykonania wielu zapytań do danych o złożonych typach, które są słabo wspierane przez komercyjne systemy zarządzania bazą danych. Rozwiązanie tego problemu jest możliwe w Oracle 10g, dzięki mecha-nizmowi Data Cartridges pozwalającemu na rozszerzanie funkcjonalności serwera Oracle. Pozwala on na implementację własnych indeksów wspierających różne typy zapytań na danych o złożonych typach. 
W niniejszej publikacji przedstawio-no rozwiązania pozwalające na indeksowanie złożonych typów danych charakterystycznych dla analizy koszyka sklepowe-go, oraz sposób ich implementacji przy wykorzystaniu machizmu Data Cartridges.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Olbrzymie bazy danych dotyczących sprzedaży produktów w supermarketach są zbyt duże, by można na nich przeprowa-dzić „ręczną” analizę. Odpowiedzią na ten problem jest użycie algorytmów eksploracji danych, czyli tzw. "analizy koszyka sklepowego", do automatycznej analizy danych w celu otrzymania użytecznej wiedzy. W wyniku takiej analizy (w zależno-ści od parametrów algorytmów) można otrzymać olbrzymią liczbę wzorców, takich jak: zbiory częste, reguły asocjacyjne albo wzorce sekwencji. Wśród tych wzorców znajduje się wiele zależności, które są przypadkowe, nieciekawe, bądź już znane. Zadaniem człowieka jest odnalezienie wśród wyników eksploracji danych wzorców ciekawych i o praktycznym zastosowaniu. Taka analiza wymaga wykonania wielu zapytań do danych o złożonych typach, które są słabo wspierane przez komercyjne systemy zarządzania bazą danych. Rozwiązanie tego problemu jest możliwe w Oracle 10g, dzięki mecha-nizmowi Data Cartridges pozwalającemu na rozszerzanie funkcjonalności serwera Oracle. Pozwala on na implementację własnych indeksów wspierających różne typy zapytań na danych o złożonych typach. W niniejszej publikacji przedstawio-no rozwiązania pozwalające na indeksowanie złożonych typów danych charakterystycznych dla analizy koszyka sklepowe-go, oraz sposób ich implementacji przy wykorzystaniu machizmu Data Cartridges. |
Andrzejewski, Witold; Morzy, Tadeusz SeqTrie: An Index for Data Mining Applications Inproceedings Proceedings of the ADMKD 2006 Workshop, 2006. @inproceedings{AM2006b, title = {SeqTrie: An Index for Data Mining Applications}, author = {Witold Andrzejewski and Tadeusz Morzy}, year = {2006}, date = {2006-09-01}, booktitle = {Proceedings of the ADMKD 2006 Workshop}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
2005 |
Andrzejewski, Witold; Morzy, Tadeusz; Morzy, Mikołaj Indexing of sequences of sets for efficient exact and similar subsequence matching Inproceedings Proceedings of the ISCIS 2005 Conference, pp. 864–873, Springer-Verlag, Berlin, Heidelberg, 2005. @inproceedings{AMM2005, title = {Indexing of sequences of sets for efficient exact and similar subsequence matching}, author = {Witold Andrzejewski and Tadeusz Morzy and Mikołaj Morzy}, url = {http://www.cs.put.poznan.pl/mmorzy/papers/iscis05.pdf}, year = {2005}, date = {2005-09-01}, booktitle = {Proceedings of the ISCIS 2005 Conference}, number = {3733}, pages = {864--873}, publisher = {Springer-Verlag}, address = {Berlin, Heidelberg}, series = {Lecture Notes in Computer Science}, abstract = {Object-relational database management systems allow users to define complex data types, such as objects, collections, and nested tables. Unfortunately, most commercially available database systems do not support either efficient querying or indexing of complex attributes. Different indexing schemes for complex data types have been proposed in the literature so far, most of them being application-oriented proposals. The lack of a single universal indexing technique for attributes containing sets and sequences of values significantly hinders practical usability of these data types in user applications. In this paper we present a novel indexing technique for sequence-valued attributes. Our index permits to index not only sequences of values, but sequences of sets of values as well. Experimental evaluation of the index proves the feasibility and benefit of the index in exact and similar matching of subsequences.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Object-relational database management systems allow users to define complex data types, such as objects, collections, and nested tables. 
Unfortunately, most commercially available database systems do not support either efficient querying or indexing of complex attributes. Different indexing schemes for complex data types have been proposed in the literature so far, most of them being application-oriented proposals. The lack of a single universal indexing technique for attributes containing sets and sequences of values significantly hinders practical usability of these data types in user applications. In this paper we present a novel indexing technique for sequence-valued attributes. Our index permits to index not only sequences of values, but sequences of sets of values as well. Experimental evaluation of the index proves the feasibility and benefit of the index in exact and similar matching of subsequences. |
Andrzejewski, Witold; Morzy, Tadeusz; Morzy, Mikołaj Indexing of sequential patterns for efficient analysis of data mining results Inproceedings Proceedings of the ADMKD 2005 Workshop, 2005. @inproceedings{AMM2005b, title = {Indexing of sequential patterns for efficient analysis of data mining results}, author = {Witold Andrzejewski and Tadeusz Morzy and Mikołaj Morzy}, year = {2005}, date = {2005-09-01}, booktitle = {Proceedings of the ADMKD 2005 Workshop}, abstract = {Mining sequential patterns allows the analysis of large databases of sales data that are not susceptible for investigation “by hand”. Unfortunately, data mining algorithms produce a large number of results, that often exceed the size of the original database. Analysis of data mining results is usually performed by the end user of the data mining application manually. During this process a large number of subsequence queries are executed. Such queries are not well supported by traditional database management systems. In this paper we present a novel indexing technique for sequences of sets such as sequential patterns or sales history. Experimental evaluation of the index proves the feasibility and benefit of the index in exact and similar matching of subsequences.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Mining sequential patterns allows the analysis of large databases of sales data that are not susceptible for investigation “by hand”. Unfortunately, data mining algorithms produce a large number of results, that often exceed the size of the original database. Analysis of data mining results is usually performed by the end user of the data mining application manually. During this process a large number of subsequence queries are executed. Such queries are not well supported by traditional database management systems. In this paper we present a novel indexing technique for sequences of sets such as sequential patterns or sales history. 
Experimental evaluation of the index proves the feasibility and benefit of the index in exact and similar matching of subsequences. |
Andrzejewski, Witold; Morzy, Tadeusz (in polish) Nowy indeks wspierający zapytania przybliżone dla sekwencji zbiorów Inproceedings Proceedings of the KNTPD 2005 Conference, 2005. @inproceedings{AM2005, title = {(in polish) Nowy indeks wspierający zapytania przybliżone dla sekwencji zbiorów}, author = {Witold Andrzejewski and Tadeusz Morzy}, year = {2005}, date = {2005-09-01}, booktitle = {Proceedings of the KNTPD 2005 Conference}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Andrzejewski, Witold; Królikowski, Zbyszko; Morzy, Mikołaj; Morzy, Tadeusz (in polish) Hierarchiczny indeks bitmapowy wspierający wykonywanie zapytań w bazach danych z atrybutami zawierającymi zbiory Journal Article Archiwum Informatyki Teoretycznej i Stosowanej, 17 (3), pp. 213–228, 2005. @article{AKMM2005, title = {(in polish) Hierarchiczny indeks bitmapowy wspierający wykonywanie zapytań w bazach danych z atrybutami zawierającymi zbiory}, author = {Witold Andrzejewski and Zbyszko Królikowski and Mikołaj Morzy and Tadeusz Morzy}, year = {2005}, date = {2005-09-01}, journal = {Archiwum Informatyki Teoretycznej i Stosowanej}, volume = {17}, number = {3}, pages = {213--228}, abstract = {Set attributes are a natural and convenient means for modelling complex objects of the real world. Modern object-relational database systems enable us to store sets as single attributes and provide a limited support for queries on such attributes. Effective execution of queries in database systems is possible due to index structures, which considerably speed up data access In the first part of this paper we make a short revision of already existing set indexing techniques. In the second part we introduce a novel index for set attributes called Hierarchical Bitmap Index, which supports subset, superset, equality and similarity queries, as well as generalized versions of these queries. Atrybuty zawierające zbiory są naturalnym i wygodnym sposobem modelowania złożonych obiektów świata rzeczywistego. Współczesne relacyjno-obiektowe systemy baz danych umożliwiają przechowywanie zbiorów w postaci pojedynczych atrybutów oraz wspierają w określonym zakresie wydawanie zapytań dotyczących takich atrybutów. Efektywne wykonywanie zapytań w systemie bazy danych jest możliwe między innymi dzięki indeksom, które wydatnie przyspieszają dostęp do danych. W pierwszej części niniejszej pracy przedstawiono przegląd aktualnie dostępnych technik indeksowania zbiorów. 
W kolejnej części, przedstawiono propozycję nowego indeksu dla baz danych przechowujących zbiory, tj. hierarchicznego indeksu bitmapowego, wspierającego wykonywanie zapytań o nadzbiory, podzbiory, zbiory identyczne, zbiory podobne, jak również uogólnione wersje tych zapytań.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Set attributes are a natural and convenient means for modelling complex objects of the real world. Modern object-relational database systems enable us to store sets as single attributes and provide a limited support for queries on such attributes. Effective execution of queries in database systems is possible due to index structures, which considerably speed up data access In the first part of this paper we make a short revision of already existing set indexing techniques. In the second part we introduce a novel index for set attributes called Hierarchical Bitmap Index, which supports subset, superset, equality and similarity queries, as well as generalized versions of these queries. Atrybuty zawierające zbiory są naturalnym i wygodnym sposobem modelowania złożonych obiektów świata rzeczywistego. Współczesne relacyjno-obiektowe systemy baz danych umożliwiają przechowywanie zbiorów w postaci pojedynczych atrybutów oraz wspierają w określonym zakresie wydawanie zapytań dotyczących takich atrybutów. Efektywne wykonywanie zapytań w systemie bazy danych jest możliwe między innymi dzięki indeksom, które wydatnie przyspieszają dostęp do danych. W pierwszej części niniejszej pracy przedstawiono przegląd aktualnie dostępnych technik indeksowania zbiorów. W kolejnej części, przedstawiono propozycję nowego indeksu dla baz danych przechowujących zbiory, tj. hierarchicznego indeksu bitmapowego, wspierającego wykonywanie zapytań o nadzbiory, podzbiory, zbiory identyczne, zbiory podobne, jak również uogólnione wersje tych zapytań. |
Andrzejewski, Witold; Królikowski, Zbyszko; Morzy, Mikołaj; Morzy, Tadeusz (in polish) Ocena efektywności hierarchicznego indeksu bitmapowego wspierającego wykonywanie zapytań w bazach danych z atrybutami zawierającymi zbiory Journal Article Archiwum Informatyki Teoretycznej i Stosowanej, 17 (4), pp. 273–288, 2005. @article{AKMM2005b, title = {(in polish) Ocena efektywności hierarchicznego indeksu bitmapowego wspierającego wykonywanie zapytań w bazach danych z atrybutami zawierającymi zbiory}, author = {Witold Andrzejewski and Zbyszko Królikowski and Mikołaj Morzy and Tadeusz Morzy}, year = {2005}, date = {2005-09-01}, journal = {Archiwum Informatyki Teoretycznej i Stosowanej}, volume = {17}, number = {4}, pages = {273--288}, abstract = {In this paper we paper we present performance evaluation of the hierarchical bitmap index supporting processing of data mining queries. Performance evaluation includes the results of several experiments on processing of various data mining queries against databases with set valued attributes. Data mining queries were processed using the following strategies: a \"naive\" approach consisting in full database scan, inverted files, RD trees, S trees, and two types of the hierarchical bitmap index. Databases used through-out the experiments differed in the number of sets and the average sizes of sets stored in the database. The results of the conducted experiments show that the hierarchical bitmap index outperforms other indexing techniques with respect to the processing of equality, superset, and similarity queries. For subset queries the hierarchical bitmap index was surpassed only by the inverted file index. W niniejszej pracy przedstawiono ocenę efektywności hierarchicznego indeksu bitmapowego, wspierającego wykonywanie zapytań eksploracyjnych. W ramach badań wykonano zakrojony na szeroką skalę zestaw eksperymentów, obejmujących przetwarzanie zapytań eksploracyjnych wszystkich klas do baz danych z atrybutami zawierającymi zbiory. 
Zapytania były realizowane z wykorzystaniem \"naiwnego\" algorytmu wykonującego pełny przegląd bazy danych, pliku odwróconego, RD drzewa, S drzewa oraz dwóch odmian hierarchicznego indeksu bitmapowego. Bazy danych wykorzystywane w trakcie eksperymentów różniły się liczbą przechowywanych zbiorów oraz średnimi rozmiarami zbiorów. Wyniki pomiarów wykazały, że w porównaniu z innymi technikami stosowanymi do tej pory, hierarchiczny indeks bitmapowy charakteryzuje się najwyższą efektywnością przetwarzania dla zapytań o podzbiory, zapytań równościowych, oraz zapytań przybliżonych. Dla zapytań o nadzbiory efektywność hierarchicznego indeksu bitmapowego jest niższa jedynie od plików odwróconych.}, keywords = {}, pubstate = {published}, tppubtype = {article} } In this paper we present performance evaluation of the hierarchical bitmap index supporting processing of data mining queries. Performance evaluation includes the results of several experiments on processing of various data mining queries against databases with set valued attributes. Data mining queries were processed using the following strategies: a "naive" approach consisting in full database scan, inverted files, RD trees, S trees, and two types of the hierarchical bitmap index. Databases used throughout the experiments differed in the number of sets and the average sizes of sets stored in the database. The results of the conducted experiments show that the hierarchical bitmap index outperforms other indexing techniques with respect to the processing of equality, superset, and similarity queries. For subset queries the hierarchical bitmap index was surpassed only by the inverted file index. W niniejszej pracy przedstawiono ocenę efektywności hierarchicznego indeksu bitmapowego, wspierającego wykonywanie zapytań eksploracyjnych. 
W ramach badań wykonano zakrojony na szeroką skalę zestaw eksperymentów, obejmujących przetwarzanie zapytań eksploracyjnych wszystkich klas do baz danych z atrybutami zawierającymi zbiory. Zapytania były realizowane z wykorzystaniem "naiwnego" algorytmu wykonującego pełny przegląd bazy danych, pliku odwróconego, RD drzewa, S drzewa oraz dwóch odmian hierarchicznego indeksu bitmapowego. Bazy danych wykorzystywane w trakcie eksperymentów różniły się liczbą przechowywanych zbiorów oraz średnimi rozmiarami zbiorów. Wyniki pomiarów wykazały, że w porównaniu z innymi technikami stosowanymi do tej pory, hierarchiczny indeks bitmapowy charakteryzuje się najwyższą efektywnością przetwarzania dla zapytań o podzbiory, zapytań równościowych, oraz zapytań przybliżonych. Dla zapytań o nadzbiory efektywność hierarchicznego indeksu bitmapowego jest niższa jedynie od plików odwróconych. |
2003 |
Morzy, Mikołaj; Królikowski, Zbyszko; Andrzejewski, Witold; Gaertig, Piotr; Radom, Marcin (in polish) Hierarchiczny indeks bitmapowy – nowa struktura do indeksowania atrybutów zbiorowych Miscellaneous Technical Report RB-010/03, 2003. @misc{MKAGR2003, title = {(in polish) Hierarchiczny indeks bitmapowy – nowa struktura do indeksowania atrybutów zbiorowych}, author = {Mikołaj Morzy and Zbyszko Królikowski and Witold Andrzejewski and Piotr Gaertig and Marcin Radom}, year = {2003}, date = {2003-09-01}, howpublished = {Technical Report RB-010/03}, keywords = {}, pubstate = {published}, tppubtype = {misc} } |
Morzy, Mikołaj; Królikowski, Zbyszko; Andrzejewski, Witold; Antoniewicz, Mikołaj; Gaertig, Piotr; Radom, Marcin (in polish) Indeksowanie atrybutów zbiorowych – eksperymentalne porównanie istniejących technik Miscellaneous Technical Report RB-002/03, 2003. @misc{MKAAGR2003, title = {(in polish) Indeksowanie atrybutów zbiorowych – eksperymentalne porównanie istniejących technik}, author = {Mikołaj Morzy and Zbyszko Królikowski and Witold Andrzejewski and Mikołaj Antoniewicz and Piotr Gaertig and Marcin Radom}, year = {2003}, date = {2003-09-01}, howpublished = {Technical Report RB-002/03}, keywords = {}, pubstate = {published}, tppubtype = {misc} } |