Semantically Rich Local Dataset Generation for Explainable AI in Genomics
Created by W.Langdon from
gp-bibliography.bib Revision:1.8051
@Comment{barbosa:2024:GECCO: GECCO 2024 conference paper. The DOI field holds the bare DOI with no resolver prefix. Fields such as organisation, publisher_address, isbn13 and size are non-standard and are ignored by the standard BibTeX styles.}
@InProceedings{barbosa:2024:GECCO,
  author =       "Pedro Barbosa and Rosina Savisaar and
                 Alcides Fonseca",
  title =        "Semantically Rich Local Dataset Generation for
                 Explainable {AI} in Genomics",
  booktitle =    "Proceedings of the 2024 Genetic and Evolutionary
                 Computation Conference",
  year =         "2024",
  editor =       "Jean-Baptiste Mouret and Kai Qin and Julia Handl and
                 Xiaodong Li and Markus Wagner and Mario Garza-Fabre and
                 Kate Smith-Miles and Richard Allmendinger and
                 Ying Bi and Grant Dick and Amir H Gandomi and
                 Marcella Scoczynski Ribeiro Martins and Hirad Assimi and
                 Nadarajen Veerapen and Yuan Sun and
                 Mario Andres Munyoz and Ahmed Kheiri and Nguyen Su and
                 Dhananjay Thiruvady and Andy Song and Frank Neumann and Carla Silva",
  pages =        "267--276",
  address =      "Melbourne, Australia",
  series =       "GECCO '24",
  month =        "14--18 " # jul,
  organisation = "SIGEVO",
  publisher =    "Association for Computing Machinery",
  publisher_address = "New York, NY, USA",
  keywords =     "genetic algorithms, genetic programming, XAI,
                 evolutionary computation, instance generation,
                 combinatorial optimization, local explainability, RNA
                 splicing, Evolutionary Machine Learning",
  isbn13 =       "979-8-4007-0494-9",
  DOI =          "10.1145/3638529.3653990",
  size =         "10 pages",
  abstract =     "Black box deep learning models trained on genomic
                 sequences excel at predicting the outcomes of different
                 gene regulatory mechanisms. Therefore, interpreting
                 these models may provide novel insights into the
                 underlying biology, supporting downstream biomedical
                 applications. Due to their complexity, interpretable
                 surrogate models can only be built for local
                 explanations (e.g., a single instance). However,
                 accomplishing this requires generating a dataset in the
                 neighborhood of the input, which must maintain
                 syntactic similarity to the original data while
                 introducing semantic variability in the model's
                 predictions. This task is challenging due to the
                 complex sequence-to-function relationship of DNA. We
                 propose using Genetic Programming to generate datasets
                 by evolving perturbations in sequences that contribute
                 to their semantic diversity. Our custom, domain-guided
                 individual representation effectively constrains
                 syntactic similarity, and we provide two alternative
                 fitness functions that promote diversity with no
                 computational effort. Applied to the RNA splicing
                 domain, our approach quickly achieves good diversity
                 and significantly outperforms a random baseline in
                 exploring the search space, as shown by our
                 proof-of-concept, short RNA sequence. Furthermore, we
                 assess its generalizability and demonstrate scalability
                 to larger sequences, resulting in an approximately 30\%
                 improvement over the baseline.",
  notes =        "GECCO-2024 EML A Recombination of the 33rd
                 International Conference on Genetic Algorithms (ICGA)
                 and the 29th Annual Genetic Programming Conference
                 (GP)",
}
Genetic Programming entries for
Pedro Santos Barbosa
Rosina Savisaar
Alcides Fonseca
Citations