Jaime Lorenzo-Trueba, Roberto Barra-Chicote, Junichi Yamagishi, Juan M. Montero (2014): Towards Cross-lingual Emotion Transplantation. In: Proc. Iberspeech 2014, 2014. (Type: Inproceeding | BibTeX | Tags: cross-lingual, emotion transplantation, expressive speech synthesis)
@inproceedings{CrossLingualTransplantation2014,
  title     = {Towards Cross-lingual Emotion Transplantation},
  author    = {Lorenzo-Trueba, Jaime and Barra-Chicote, Roberto and Yamagishi, Junichi and Montero, Juan M.},
  booktitle = {Proc. Iberspeech 2014},
  year      = {2014},
  date      = {2014-10-22},
  keywords  = {cross-lingual, emotion transplantation, expressive speech synthesis},
}
|
A. Gallardo-Antolín, J.M. Montero, S. King (2014): A Comparison of Open-Source Segmentation Architectures for Dealing with Imperfect Data from the Media in Speech Synthesis. In: Proc. Interspeech 2014, 2014. (Type: Inproceeding | Abstract | BibTeX | Tags: expressive speech synthesis, speaker diarization, speaking styles, Speech synthesis)
@inproceedings{Gallardo2014,
  title     = {A Comparison of Open-Source Segmentation Architectures for Dealing with Imperfect Data from the Media in Speech Synthesis},
  author    = {Gallardo-Antolín, A. and Montero, J. M. and King, S.},
  booktitle = {Proc. Interspeech 2014},
  year      = {2014},
  date      = {2014-09-17},
  keywords  = {expressive speech synthesis, speaker diarization, speaking styles, Speech synthesis},
}
A Comparison of Open-Source Segmentation Architectures for Dealing with Imperfect Data from the Media in Speech Synthesis
|
J. Lorenzo-Trueba, J. D. Echeverry-Correa, R. Barra-Chicote, R. San-Segundo, J. Ferreiros, A. Gallardo-Antolín, J. Yamagishi, S. King, J. M. Montero (2014): Development of a Genre-Dependent TTS System with Cross-Speaker Speaking-Style Transplantation. In: ISCA/IEEE Workshop on Speech, Language and Audio in Multimedia (SLAM) 2014., 2014. (Type: Inproceeding | Abstract | BibTeX | Tags: expressive speech synthesis, genre detection, speaking styles, TTS)
@inproceedings{LorenzoTrueba2014,
  title     = {Development of a Genre-Dependent TTS System with Cross-Speaker Speaking-Style Transplantation},
  author    = {Lorenzo-Trueba, J. and Echeverry-Correa, J. D. and Barra-Chicote, R. and San-Segundo, R. and Ferreiros, J. and Gallardo-Antolín, A. and Yamagishi, J. and King, S. and Montero, J. M.},
  booktitle = {ISCA/IEEE Workshop on Speech, Language and Audio in Multimedia (SLAM) 2014},
  year      = {2014},
  date      = {2014-09-10},
  keywords  = {expressive speech synthesis, genre detection, speaking styles, TTS},
}
Development of a Genre-Dependent TTS System with Cross-Speaker Speaking-Style Transplantation
|
J. Lorenzo-Trueba, R. Barra-Chicote, J. Yamagishi, O. Watts, J. M. Montero (2013): Towards Speaking Style Transplantation in Speech Synthesis. In: Proc. 8th ISCA Speech Synthesis Workshop, 2013. (Type: Inproceeding | Abstract | Links | BibTeX | Tags: Adaptation, expressive speech synthesis, speaking styles, transplantation)
@inproceedings{Lorenzo-Trueba2013,
  title     = {Towards Speaking Style Transplantation in Speech Synthesis},
  author    = {Lorenzo-Trueba, J. and Barra-Chicote, R. and Yamagishi, J. and Watts, O. and Montero, J. M.},
  booktitle = {Proc. 8th ISCA Speech Synthesis Workshop},
  year      = {2013},
  date      = {2013-09-01},
  url       = {http://www-gth.die.upm.es/research/documentation/AG-134Tow-13.pdf},
  keywords  = {Adaptation, expressive speech synthesis, speaking styles, transplantation},
}
Towards Speaking Style Transplantation in Speech Synthesis
|
J. Lorenzo-Trueba, R. Barra-Chicote, J. Yamagishi, O. Watts, J.M. Montero (2013): Evaluation of a Transplantation Algorithm for Expressive Speech Synthesis. In: Proceedings of Workshop en Tecnologías Accesibles, IV Congreso Español de Informática CEDI2013, 2013. (Type: Inproceeding | Abstract | BibTeX | Tags: expressive speech synthesis, transplantation)
@inproceedings{Lorenzo-Trueba2013b,
  title     = {Evaluation of a Transplantation Algorithm for Expressive Speech Synthesis},
  author    = {Lorenzo-Trueba, J. and Barra-Chicote, R. and Yamagishi, J. and Watts, O. and Montero, J. M.},
  booktitle = {Proceedings of Workshop en Tecnologías Accesibles, IV Congreso Español de Informática CEDI2013},
  year      = {2013},
  date      = {2013-09-01},
  keywords  = {expressive speech synthesis, transplantation},
}
Evaluation of a Transplantation Algorithm for Expressive Speech Synthesis
|
Jaime Lorenzo-Trueba, Oliver Watts, Roberto Barra-Chicote, Junichi Yamagishi, Simon King, Juan M Montero (2012): Simple4All proposals for the Albayzin Evaluations in Speech Synthesis. In: Proc. Iberspeech 2012, 2012. (Type: Inproceeding | Abstract | Links | BibTeX | Tags: Albayzin challenge, expressive speech synthesis)
@inproceedings{LorenzoAlbayzinProposal2012,
  title     = {Simple4All Proposals for the {Albayzin} Evaluations in Speech Synthesis},
  author    = {Lorenzo-Trueba, Jaime and Watts, Oliver and Barra-Chicote, Roberto and Yamagishi, Junichi and King, Simon and Montero, Juan M.},
  booktitle = {Proc. Iberspeech 2012},
  year      = {2012},
  date      = {2012-11-21},
  url       = {http://consortium.simple4all.org/files/2012/10/simple4all-proposal.pdf},
  abstract  = {This paper presents several Spanish emotional TTS systems developed in Simple4All},
  keywords  = {Albayzin challenge, expressive speech synthesis},
}
This paper presents several Spanish emotional TTS systems developed in Simple4All
|
Jaime Lorenzo-Trueba, Roberto Barra-Chicote, Tuomo Raitio, Nicolas Obin, Paavo Alku, Junichi Yamagishi, Juan M Montero (2012): Towards Glottal Source Controllability in Expressive Speech Synthesis. In: Proc. Interspeech 2012, Portland (Oregon), USA, 2012, ISSN: 1990-9772. (Type: Inproceeding | Abstract | Links | BibTeX | Tags: expressive speech synthesis, glottal source modeling, speaking styles)
@inproceedings{LorenzoStyleAndGlott2012,
  title     = {Towards Glottal Source Controllability in Expressive Speech Synthesis},
  author    = {Lorenzo-Trueba, Jaime and Barra-Chicote, Roberto and Raitio, Tuomo and Obin, Nicolas and Alku, Paavo and Yamagishi, Junichi and Montero, Juan M.},
  booktitle = {Proc. Interspeech 2012},
  year      = {2012},
  date      = {2012-09-13},
  address   = {Portland (Oregon), USA},
  issn      = {1990-9772},
  url       = {http://www-gth.die.upm.es/research/documentation/AG-112Tow-12.pdf},
  abstract  = {In order to obtain more human like sounding human-machine interfaces we must first be able to give them expressive capabilities in the way of emotional and stylistic features so as to closely adequate them to the intended task. If we want to replicate those features it is not enough to merely replicate the prosodic information of fundamental frequency and speaking rhythm. The proposed additional layer is the modification of the glottal model, for which we make use of the GlottHMM parameters. This paper analyzes the viability of such an approach by verifying that the expressive nuances are captured by the aforementioned features, obtaining 95% recognition rates on styled speaking and 82% on emotional speech. Then we evaluate the effect of speaker bias and recording environment on the source modeling in order to quantify possible problems when analyzing multi-speaker databases. Finally we propose a speaking styles separation for Spanish based on prosodic features and check its perceptual significance.},
  keywords  = {expressive speech synthesis, glottal source modeling, speaking styles},
}
In order to obtain more human like sounding human-machine interfaces we must first be able to give them expressive capabilities in the way of emotional and stylistic features so as to closely adequate them to the intended task. If we want to replicate those features it is not enough to merely replicate the prosodic information of fundamental frequency and speaking rhythm. The proposed additional layer is the modification of the glottal model, for which we make use of the GlottHMM parameters. This paper analyzes the viability of such an approach by verifying that the expressive nuances are captured by the aforementioned features, obtaining 95% recognition rates on styled speaking and 82% on emotional speech. Then we evaluate the effect of speaker bias and recording environment on the source modeling in order to quantify possible problems when analyzing multi-speaker databases. Finally we propose a speaking styles separation for Spanish based on prosodic features and check its perceptual significance.
|
J. Lorenzo, B. Martinez, R. Barra-Chicote, V. Lopez–Ludena, J. Ferreiros, J. Yamagishi, J.M. Montero (2012): Towards an Unsupervised Speaking Style Voice Building Framework: Multi–Style Speaker Diarization. In: Proc. Interspeech 2012, 2012, ISSN: 1990-9772. (Type: Inproceeding | Abstract | Links | BibTeX | Tags: expressive speech synthesis, speaker diarization, speaking styles, voice cloning)
@inproceedings{LorenzoStyleDiarization2012,
  title     = {Towards an Unsupervised Speaking Style Voice Building Framework: Multi-Style Speaker Diarization},
  author    = {Lorenzo, J. and Martinez, B. and Barra-Chicote, R. and Lopez-Ludena, V. and Ferreiros, J. and Yamagishi, J. and Montero, J. M.},
  booktitle = {Proc. Interspeech 2012},
  year      = {2012},
  date      = {2012-09-12},
  issn      = {1990-9772},
  url       = {http://www-gth.die.upm.es/research/documentation/AG-113Tow-12.pdf},
  abstract  = {Current text–to–speech systems are developed using studio-recorded speech in a neutral style or based on acted emotions. However, the proliferation of media sharing sites would allow developing a new generation of speech–based systems which could cope with spontaneous and styled speech. This paper proposes an architecture to deal with realistic recordings and carries out some experiments on unsupervised speaker diarization. In order to maximize the speaker purity of the clusters while keeping a high speaker coverage, the paper evaluates the F–measure of a diarization module, achieving high scores (>85%) especially when the clusters are longer than 30 seconds, even for the more spontaneous and expressive styles (such as talk shows or sports).},
  keywords  = {expressive speech synthesis, speaker diarization, speaking styles, voice cloning},
}
Current text–to–speech systems are developed using studio-recorded speech in a neutral style or based on acted emotions. However, the proliferation of media sharing sites would allow developing a new generation of speech–based systems which could cope with spontaneous and styled speech. This paper proposes an architecture to deal with realistic recordings and carries out some experiments on unsupervised speaker diarization. In order to maximize the speaker purity of the clusters while keeping a high speaker coverage, the paper evaluates the F–measure of a diarization module, achieving high scores (>85%) especially when the clusters are longer than 30 seconds, even for the more spontaneous and expressive styles (such as talk shows or sports).
|