|
108 | 108 | "operations": {
|
109 | 109 | "methods": {
|
110 | 110 | "cancel": {
|
111 |
| - "description": "Starts asynchronous cancellation on a long-running operation. The server makes a best effort to cancel the operation, but success is not guaranteed. If the server doesn't support this method, it returns `google.rpc.Code.UNIMPLEMENTED`. Clients can use Operations.GetOperation or other methods to check whether the cancellation succeeded or whether the operation completed despite cancellation. On successful cancellation, the operation is not deleted; instead, it becomes an operation with an Operation.error value with a google.rpc.Status.code of 1, corresponding to `Code.CANCELLED`.", |
| 111 | + "description": "Starts asynchronous cancellation on a long-running operation. The server makes a best effort to cancel the operation, but success is not guaranteed. If the server doesn't support this method, it returns `google.rpc.Code.UNIMPLEMENTED`. Clients can use Operations.GetOperation or other methods to check whether the cancellation succeeded or whether the operation completed despite cancellation. On successful cancellation, the operation is not deleted; instead, it becomes an operation with an Operation.error value with a google.rpc.Status.code of `1`, corresponding to `Code.CANCELLED`.", |
112 | 112 | "flatPath": "v1/operations/{operationsId}:cancel",
|
113 | 113 | "httpMethod": "POST",
|
114 | 114 | "id": "texttospeech.operations.cancel",
|
|
294 | 294 | },
|
295 | 295 | "voices": {
|
296 | 296 | "methods": {
|
297 |
| - "generateVoiceCloningKey": { |
298 |
| - "description": "Generates voice clone key given a short voice prompt. This method validates the voice prompts with a series of checks against the voice talent statement to verify the voice clone is safe to generate.", |
299 |
| - "flatPath": "v1/voices:generateVoiceCloningKey", |
300 |
| - "httpMethod": "POST", |
301 |
| - "id": "texttospeech.voices.generateVoiceCloningKey", |
302 |
| - "parameterOrder": [], |
303 |
| - "parameters": {}, |
304 |
| - "path": "v1/voices:generateVoiceCloningKey", |
305 |
| - "request": { |
306 |
| - "$ref": "GenerateVoiceCloningKeyRequest" |
307 |
| - }, |
308 |
| - "response": { |
309 |
| - "$ref": "GenerateVoiceCloningKeyResponse" |
310 |
| - }, |
311 |
| - "scopes": [ |
312 |
| - "https://www.googleapis.com/auth/cloud-platform" |
313 |
| - ] |
314 |
| - }, |
315 | 297 | "list": {
|
316 | 298 | "description": "Returns a list of Voice supported for synthesis.",
|
317 | 299 | "flatPath": "v1/voices",
|
|
336 | 318 | }
|
337 | 319 | }
|
338 | 320 | },
|
339 |
| - "revision": "20241001", |
| 321 | + "revision": "20250114", |
340 | 322 | "rootUrl": "https://texttospeech.googleapis.com/",
|
341 | 323 | "schemas": {
|
342 | 324 | "AdvancedVoiceOptions": {
|
343 | 325 | "description": "Used for advanced voice options.",
|
344 | 326 | "id": "AdvancedVoiceOptions",
|
345 | 327 | "properties": {
|
346 | 328 | "lowLatencyJourneySynthesis": {
|
347 |
| - "description": "Only for Jounrney voices. If false, the synthesis will be context aware and have higher latency.", |
| 329 | + "description": "Only for Journey voices. If false, the synthesis is context aware and has a higher latency.", |
348 | 330 | "type": "boolean"
|
349 | 331 | }
|
350 | 332 | },
|
|
362 | 344 | "MP3",
|
363 | 345 | "OGG_OPUS",
|
364 | 346 | "MULAW",
|
365 |
| - "ALAW" |
| 347 | + "ALAW", |
| 348 | + "PCM" |
366 | 349 | ],
|
367 | 350 | "enumDescriptions": [
|
368 | 351 | "Not specified. Will return result google.rpc.Code.INVALID_ARGUMENT.",
|
369 | 352 | "Uncompressed 16-bit signed little-endian samples (Linear PCM). Audio content returned as LINEAR16 also contains a WAV header.",
|
370 | 353 | "MP3 audio at 32kbps.",
|
371 |
| - "Opus encoded audio wrapped in an ogg container. The result will be a file which can be played natively on Android, and in browsers (at least Chrome and Firefox). The quality of the encoding is considerably higher than MP3 while using approximately the same bitrate.", |
| 354 | + "Opus encoded audio wrapped in an ogg container. The result is a file which can be played natively on Android, and in browsers (at least Chrome and Firefox). The quality of the encoding is considerably higher than MP3 while using approximately the same bitrate.", |
372 | 355 | "8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law. Audio content returned as MULAW also contains a WAV header.",
|
373 |
| - "8-bit samples that compand 14-bit audio samples using G.711 PCMU/A-law. Audio content returned as ALAW also contains a WAV header." |
| 356 | + "8-bit samples that compand 14-bit audio samples using G.711 PCMU/A-law. Audio content returned as ALAW also contains a WAV header.", |
| 357 | + "Uncompressed 16-bit signed little-endian samples (Linear PCM). Note that as opposed to LINEAR16, audio won't be wrapped in a WAV (or any other) header." |
374 | 358 | ],
|
375 | 359 | "type": "string"
|
376 | 360 | },
|
|
423 | 407 | ],
|
424 | 408 | "enumDescriptions": [
|
425 | 409 | "Not specified.",
|
426 |
| - "IPA. (e.g. apple -> ˈæpəl ) https://en.wikipedia.org/wiki/International_Phonetic_Alphabet", |
427 |
| - "X-SAMPA (e.g. apple -> \"{p@l\" ) https://en.wikipedia.org/wiki/X-SAMPA" |
| 410 | + "IPA, such as apple -> ˈæpəl. https://en.wikipedia.org/wiki/International_Phonetic_Alphabet", |
| 411 | + "X-SAMPA, such as apple -> \"{p@l\". https://en.wikipedia.org/wiki/X-SAMPA" |
428 | 412 | ],
|
429 | 413 | "type": "string"
|
430 | 414 | },
|
431 | 415 | "phrase": {
|
432 |
| - "description": "The phrase to which the customization will be applied. The phrase can be multiple words (in the case of proper nouns etc), but should not span to a whole sentence.", |
| 416 | + "description": "The phrase to which the customization is applied. The phrase can be multiple words, such as proper nouns, but shouldn't span the length of the sentence.", |
433 | 417 | "type": "string"
|
434 | 418 | },
|
435 | 419 | "pronunciation": {
|
|
444 | 428 | "id": "CustomPronunciations",
|
445 | 429 | "properties": {
|
446 | 430 | "pronunciations": {
|
447 |
| - "description": "The pronunciation customizations to be applied.", |
| 431 | + "description": "The pronunciation customizations to apply.", |
448 | 432 | "items": {
|
449 | 433 | "$ref": "CustomPronunciationParams"
|
450 | 434 | },
|
|
485 | 469 | "properties": {},
|
486 | 470 | "type": "object"
|
487 | 471 | },
|
488 |
| - "GenerateVoiceCloningKeyRequest": { |
489 |
| - "description": "Request message for the `GenerateVoiceCloningKey` method.", |
490 |
| - "id": "GenerateVoiceCloningKeyRequest", |
491 |
| - "properties": { |
492 |
| - "consentScript": { |
493 |
| - "description": "Required. The script used for the voice talent statement. The script will be provided to the caller through other channels. It must be returned unchanged in this field.", |
494 |
| - "type": "string" |
495 |
| - }, |
496 |
| - "languageCode": { |
497 |
| - "description": "Required. The language of the supplied audio as a [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. Example: \"en-US\". See [Language Support](https://cloud.google.com/speech-to-text/docs/languages) for a list of the currently supported language codes.", |
498 |
| - "type": "string" |
499 |
| - }, |
500 |
| - "referenceAudio": { |
501 |
| - "$ref": "InputAudio", |
502 |
| - "description": "Required. The training audio used to create voice clone. This is currently limited to LINEAR16 PCM WAV files mono audio with 24khz sample rate. This needs to be specified in [InputAudio.audio_config], other values will be explicitly rejected." |
503 |
| - }, |
504 |
| - "voiceTalentConsent": { |
505 |
| - "$ref": "InputAudio", |
506 |
| - "description": "Required. The voice talent audio used to verify consent to voice clone." |
507 |
| - } |
508 |
| - }, |
509 |
| - "type": "object" |
510 |
| - }, |
511 |
| - "GenerateVoiceCloningKeyResponse": { |
512 |
| - "description": "Response message for the `GenerateVoiceCloningKey` method.", |
513 |
| - "id": "GenerateVoiceCloningKeyResponse", |
514 |
| - "properties": { |
515 |
| - "voiceCloningKey": { |
516 |
| - "description": "The voice clone key. Use it in the SynthesizeSpeechRequest by setting [voice.voice_clone.voice_cloning_key].", |
517 |
| - "type": "string" |
518 |
| - } |
519 |
| - }, |
520 |
| - "type": "object" |
521 |
| - }, |
522 | 472 | "GoogleCloudTexttospeechV1SynthesizeLongAudioMetadata": {
|
523 | 473 | "description": "Metadata for response returned by the `SynthesizeLongAudio` method.",
|
524 | 474 | "id": "GoogleCloudTexttospeechV1SynthesizeLongAudioMetadata",
|
|
542 | 492 | },
|
543 | 493 | "type": "object"
|
544 | 494 | },
|
545 |
| - "InputAudio": { |
546 |
| - "description": "Holds audio content and config.", |
547 |
| - "id": "InputAudio", |
548 |
| - "properties": { |
549 |
| - "audioConfig": { |
550 |
| - "$ref": "InputAudioConfig", |
551 |
| - "description": "Required. Provides information that specifies how to process content." |
552 |
| - }, |
553 |
| - "content": { |
554 |
| - "description": "Required. The audio data bytes encoded as specified in `InputAudioConfig`. Note: as with all bytes fields, proto buffers use a pure binary representation, whereas JSON representations use base64. Audio samples should be between 5-25 seconds in length.", |
555 |
| - "format": "byte", |
556 |
| - "type": "string" |
557 |
| - } |
558 |
| - }, |
559 |
| - "type": "object" |
560 |
| - }, |
561 |
| - "InputAudioConfig": { |
562 |
| - "description": "Description of inputted audio data.", |
563 |
| - "id": "InputAudioConfig", |
564 |
| - "properties": { |
565 |
| - "audioEncoding": { |
566 |
| - "description": "Required. The format of the audio byte stream.", |
567 |
| - "enum": [ |
568 |
| - "AUDIO_ENCODING_UNSPECIFIED", |
569 |
| - "LINEAR16", |
570 |
| - "MP3", |
571 |
| - "OGG_OPUS", |
572 |
| - "MULAW", |
573 |
| - "ALAW" |
574 |
| - ], |
575 |
| - "enumDescriptions": [ |
576 |
| - "Not specified. Will return result google.rpc.Code.INVALID_ARGUMENT.", |
577 |
| - "Uncompressed 16-bit signed little-endian samples (Linear PCM). Audio content returned as LINEAR16 also contains a WAV header.", |
578 |
| - "MP3 audio at 32kbps.", |
579 |
| - "Opus encoded audio wrapped in an ogg container. The result will be a file which can be played natively on Android, and in browsers (at least Chrome and Firefox). The quality of the encoding is considerably higher than MP3 while using approximately the same bitrate.", |
580 |
| - "8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law. Audio content returned as MULAW also contains a WAV header.", |
581 |
| - "8-bit samples that compand 14-bit audio samples using G.711 PCMU/A-law. Audio content returned as ALAW also contains a WAV header." |
582 |
| - ], |
583 |
| - "type": "string" |
584 |
| - }, |
585 |
| - "sampleRateHertz": { |
586 |
| - "description": "Required. The sample rate (in hertz) for this audio.", |
587 |
| - "format": "int32", |
588 |
| - "type": "integer" |
589 |
| - } |
590 |
| - }, |
591 |
| - "type": "object" |
592 |
| - }, |
593 | 495 | "ListOperationsResponse": {
|
594 | 496 | "description": "The response message for Operations.ListOperations.",
|
595 | 497 | "id": "ListOperationsResponse",
|
|
622 | 524 | },
|
623 | 525 | "type": "object"
|
624 | 526 | },
|
| 527 | + "MultiSpeakerMarkup": { |
| 528 | + "description": "A collection of turns for multi-speaker synthesis.", |
| 529 | + "id": "MultiSpeakerMarkup", |
| 530 | + "properties": { |
| 531 | + "turns": { |
| 532 | + "description": "Required. Speaker turns.", |
| 533 | + "items": { |
| 534 | + "$ref": "Turn" |
| 535 | + }, |
| 536 | + "type": "array" |
| 537 | + } |
| 538 | + }, |
| 539 | + "type": "object" |
| 540 | + }, |
625 | 541 | "Operation": {
|
626 | 542 | "description": "This resource represents a long-running operation that is the result of a network API call.",
|
627 | 543 | "id": "Operation",
|
|
690 | 606 | "properties": {
|
691 | 607 | "customPronunciations": {
|
692 | 608 | "$ref": "CustomPronunciations",
|
693 |
| - "description": "Optional. The pronunciation customizations to be applied to the input. If this is set, the input will be synthesized using the given pronunciation customizations. The initial support will be for EFIGS (English, French, Italian, German, Spanish) languages, as provided in VoiceSelectionParams. Journey and Instant Clone voices are not supported yet. In order to customize the pronunciation of a phrase, there must be an exact match of the phrase in the input types. If using SSML, the phrase must not be inside a phoneme tag (entirely or partially)." |
| 609 | + "description": "Optional. The pronunciation customizations are applied to the input. If this is set, the input is synthesized using the given pronunciation customizations. The initial support is for English, French, Italian, German, and Spanish (EFIGS) languages, as provided in VoiceSelectionParams. Journey and Instant Clone voices aren't supported. In order to customize the pronunciation of a phrase, there must be an exact match of the phrase in the input types. If using SSML, the phrase must not be inside a phoneme tag." |
| 610 | + }, |
| 611 | + "multiSpeakerMarkup": { |
| 612 | + "$ref": "MultiSpeakerMarkup", |
| 613 | + "description": "The multi-speaker input to be synthesized. Only applicable for multi-speaker synthesis." |
694 | 614 | },
|
695 | 615 | "ssml": {
|
696 | 616 | "description": "The SSML document to be synthesized. The SSML document must be valid and well-formed. Otherwise the RPC will fail and return google.rpc.Code.INVALID_ARGUMENT. For more information, see [SSML](https://cloud.google.com/text-to-speech/docs/ssml).",
|
|
755 | 675 | "properties": {
|
756 | 676 | "advancedVoiceOptions": {
|
757 | 677 | "$ref": "AdvancedVoiceOptions",
|
758 |
| - "description": "Adnanced voice options." |
| 678 | + "description": "Advanced voice options." |
759 | 679 | },
|
760 | 680 | "audioConfig": {
|
761 | 681 | "$ref": "AudioConfig",
|
|
784 | 704 | },
|
785 | 705 | "type": "object"
|
786 | 706 | },
|
| 707 | + "Turn": { |
| 708 | + "description": "A multi-speaker turn.", |
| 709 | + "id": "Turn", |
| 710 | + "properties": { |
| 711 | + "speaker": { |
| 712 | + "description": "Required. The speaker of the turn, for example, 'O' or 'Q'. Please refer to documentation for available speakers.", |
| 713 | + "type": "string" |
| 714 | + }, |
| 715 | + "text": { |
| 716 | + "description": "Required. The text to speak.", |
| 717 | + "type": "string" |
| 718 | + } |
| 719 | + }, |
| 720 | + "type": "object" |
| 721 | + }, |
787 | 722 | "Voice": {
|
788 | 723 | "description": "Description of a voice supported by the TTS service.",
|
789 | 724 | "id": "Voice",
|
|
868 | 803 | },
|
869 | 804 | "voiceClone": {
|
870 | 805 | "$ref": "VoiceCloneParams",
|
871 |
| - "description": "Optional. The configuration for a voice clone. If [VoiceCloneParams.voice_clone_key] is set, the service will choose the voice clone matching the specified configuration." |
| 806 | + "description": "Optional. The configuration for a voice clone. If [VoiceCloneParams.voice_clone_key] is set, the service chooses the voice clone matching the specified configuration." |
872 | 807 | }
|
873 | 808 | },
|
874 | 809 | "type": "object"
|
|
0 commit comments