# Mirror of https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git
# (synced 2025-04-05 04:08:58 +08:00)
# 188 lines, 5.1 KiB, Python
# Task name -> special token string for that task.
# Note that "caption" and "speech_edit" map to abbreviated token names
# ("<|task_cap|>" and "<|task_edit|>"), unlike the other entries.
TASK_TOKEN_MAP = {
    "vc": "<|task_vc|>",
    "tts": "<|task_tts|>",
    "asr": "<|task_asr|>",
    "s2s": "<|task_s2s|>",
    "t2s": "<|task_t2s|>",
    "understand": "<|task_understand|>",
    "caption": "<|task_cap|>",
    "controllable_tts": "<|task_controllable_tts|>",
    "prompt_tts": "<|task_prompt_tts|>",
    "speech_edit": "<|task_edit|>",
}
# Coarse level name -> 0-based level id. The id is what gets embedded in the
# "*_label_{id}" special tokens (pitch, pitch variation, loudness, speed).
LEVELS_MAP = {
    "very_low": 0,
    "low": 1,
    "moderate": 2,
    "high": 3,
    "very_high": 4,
}
# 1-based integer -> level name. The keys are offset by one from the 0-based
# ids stored in LEVELS_MAP, so the two maps are NOT direct inverses.
# NOTE(review): presumably the keys are positions of a UI control -- confirm
# against the caller.
LEVELS_MAP_UI = {
    1: "very_low",
    2: "low",
    3: "moderate",
    4: "high",
    5: "very_high",
}
# Gender label -> integer id embedded in the "<|gender_{id}|>" special token.
GENDER_MAP = {
    "female": 0,
    "male": 1,
}
# Age-group label -> integer id embedded in the "<|age_{id}|>" special token.
# Keys are case-sensitive and use the exact spellings below.
AGE_MAP = {
    "Child": 0,
    "Teenager": 1,
    "Youth-Adult": 2,
    "Middle-aged": 3,
    "Elderly": 4,
}
# Emotion / speaking-style label -> integer id embedded in the
# "<|emotion_{id}|>" special token. Keys are upper-case; ids are contiguous
# from 0 ("UNKNOWN") to 24 ("CONTEMPT").
EMO_MAP = {
    "UNKNOWN": 0,
    "NEUTRAL": 1,
    "ANGRY": 2,
    "HAPPY": 3,
    "SAD": 4,
    "FEARFUL": 5,
    "DISGUSTED": 6,
    "SURPRISED": 7,
    "SARCASTIC": 8,
    "EXCITED": 9,
    "SLEEPY": 10,
    "CONFUSED": 11,
    "EMPHASIS": 12,
    "LAUGHING": 13,
    "SINGING": 14,
    "WORRIED": 15,
    "WHISPER": 16,
    "ANXIOUS": 17,
    "NO-AGREEMENT": 18,
    "APOLOGETIC": 19,
    "CONCERNED": 20,
    "ENUNCIATED": 21,
    "ASSERTIVE": 22,
    "ENCOURAGING": 23,
    "CONTEMPT": 24,
}
def _bounded_value_token(prefix: str, value, upper: int) -> str:
    """Format ``value`` as ``<|{prefix}_{n}|>`` with ``n`` clamped to ``[0, upper]``.

    ``value`` is converted with ``int()`` (truncating toward zero) before it is
    clamped, so float inputs are accepted.
    """
    bounded = min(upper, max(0, int(value)))
    return f"<|{prefix}_{bounded}|>"


class TokenParser:
    """Turn labels and attribute values into special-token strings.

    Covers the task token plus the attributes of a person/utterance: age,
    gender, pitch, pitch variation, loudness, speed and emotion.

    Every method is a ``@staticmethod`` and the class keeps no state, so the
    methods can be called on the class itself (``TokenParser.age("Child")``)
    or on an instance.
    """

    @staticmethod
    def age(age: str) -> str:
        """Return the ``<|age_{id}|>`` token for an age-group label.

        Raises:
            KeyError: if ``age`` is not a key of ``AGE_MAP``.
        """
        return f"<|age_{AGE_MAP[age]}|>"

    @staticmethod
    def gender(gender: str) -> str:
        """Return the ``<|gender_{id}|>`` token for a gender label.

        Raises:
            KeyError: if ``gender`` is not a key of ``GENDER_MAP``.
        """
        return f"<|gender_{GENDER_MAP[gender]}|>"

    @staticmethod
    def mel_value(mel: int) -> str:
        """Return the ``<|pitch_value_{n}|>`` token for a mel-scale pitch.

        ``mel`` is truncated to an int and clamped to ``[0, 1000]``.
        """
        return _bounded_value_token("pitch_value", mel, 1000)

    @staticmethod
    def mel_level(level: str) -> str:
        """Return the ``<|pitch_label_{id}|>`` token for a pitch level name.

        Raises:
            KeyError: if ``level`` is not a key of ``LEVELS_MAP``.
        """
        return f"<|pitch_label_{LEVELS_MAP[level]}|>"

    @staticmethod
    def pitch_var_value(pitch_std: int) -> str:
        """Return the ``<|pitch_var_value_{n}|>`` token for a pitch std value.

        ``pitch_std`` is clamped to ``[0, 10]``.

        Raises:
            AssertionError: if ``pitch_std`` is not an ``int``. This is an
                ``assert`` and is therefore skipped under ``python -O``, in
                which case the value is simply converted with ``int()``.
        """
        assert isinstance(pitch_std, int), (
            f"pitch_std must be an int, got {type(pitch_std).__name__}"
        )
        return _bounded_value_token("pitch_var_value", pitch_std, 10)

    @staticmethod
    def pitch_var_level(level: str) -> str:
        """Return the ``<|pitch_var_label_{id}|>`` token for a pitch-std level name.

        Raises:
            KeyError: if ``level`` is not a key of ``LEVELS_MAP``.
        """
        return f"<|pitch_var_label_{LEVELS_MAP[level]}|>"

    @staticmethod
    def loudness_value(loudness: int) -> str:
        """Return the ``<|loudness_value_{n}|>`` token for a loudness value.

        ``loudness`` is truncated to an int and clamped to ``[0, 30]``.

        Raises:
            AssertionError: if ``loudness`` is negative. This is an ``assert``
                and is therefore skipped under ``python -O``, in which case a
                negative value is clamped to 0 instead.
        """
        assert loudness >= 0, f"loudness must be >= 0, got {loudness!r}"
        return _bounded_value_token("loudness_value", loudness, 30)

    @staticmethod
    def loudness_level(level: str) -> str:
        """Return the ``<|loudness_label_{id}|>`` token for a loudness level name.

        Raises:
            KeyError: if ``level`` is not a key of ``LEVELS_MAP``.
        """
        return f"<|loudness_label_{LEVELS_MAP[level]}|>"

    @staticmethod
    def speed_value(speed: int) -> str:
        """Return the ``<|speed_value_{n}|>`` token for a speed value.

        ``speed`` is truncated to an int and clamped to ``[0, 10]``.
        """
        return _bounded_value_token("speed_value", speed, 10)

    @staticmethod
    def speed_level(level: str) -> str:
        """Return the ``<|speed_label_{id}|>`` token for a speed level name.

        Raises:
            KeyError: if ``level`` is not a key of ``LEVELS_MAP``.
        """
        return f"<|speed_label_{LEVELS_MAP[level]}|>"

    @staticmethod
    def task(task: str) -> str:
        """Return the special token registered for ``task`` in ``TASK_TOKEN_MAP``.

        Raises:
            AssertionError: if ``task`` is not a known task name. This is an
                ``assert``; under ``python -O`` it is skipped and the lookup
                raises ``KeyError`` instead.
        """
        assert task in TASK_TOKEN_MAP, f"unknown task: {task!r}"
        return TASK_TOKEN_MAP[task]

    @staticmethod
    def emotion(emotion: str) -> str:
        """Return the ``<|emotion_{id}|>`` token for an emotion label.

        Raises:
            KeyError: if ``emotion`` is not a key of ``EMO_MAP``.
        """
        return f"<|emotion_{EMO_MAP[emotion]}|>"
# Smoke test: build one prompt per sample out of TokenParser tokens, encode it
# with the tokenizer, and print the ids together with their decoded form.
if __name__ == "__main__":
    import sys

    from transformers import AutoTokenizer

    # Location used by the original script. Pass another tokenizer path as the
    # first command-line argument to run the smoke test elsewhere.
    default_tokenizer_path = "/aifs4su/xinshengwang/code/StyleCraft/tokenizer/stylecraft-bicodec-pitch-loudness-speed-emotion-tokenizer"
    tokenizer_path = sys.argv[1] if len(sys.argv) > 1 else default_tokenizer_path
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Parallel sample columns: the n-th entry of each list forms one sample.
    tasks = ["tts", "tts", "understand", "controllable_tts", "prompt_tts"]
    ages = ["Child", "Teenager", "Youth-Adult", "Middle-aged", "Elderly"]
    genders = ["female", "female", "female", "male", "male"]
    mels = [100, 200, 300, 400, 500]
    mel_levels = ["very_low", "low", "moderate", "high", "very_high"]
    loudnesses = [1, 10, 23, 19, 30]
    loudness_levels = ["very_low", "low", "moderate", "high", "very_high"]
    emotions = ["UNKNOWN", "NEUTRAL", "ANGRY", "HAPPY", "SAD"]

    samples = zip(
        tasks,
        ages,
        genders,
        mels,
        mel_levels,
        loudnesses,
        loudness_levels,
        emotions,
    )
    for task, age, gender, mel, mel_level, loudness, loudness_level, emotion in samples:
        # Token order matches the original script: task, age, gender, pitch
        # value, pitch level, loudness value, loudness level, emotion.
        prompt = "".join(
            [
                TokenParser.task(task),
                TokenParser.age(age),
                TokenParser.gender(gender),
                TokenParser.mel_value(mel),
                TokenParser.mel_level(mel_level),
                TokenParser.loudness_value(loudness),
                TokenParser.loudness_level(loudness_level),
                TokenParser.emotion(emotion),
            ]
        )
        ids = tokenizer.encode(prompt, add_special_tokens=False)
        print(ids)
        print("decode", tokenizer.decode(ids))