This example illustrates how to score your audios on the MOS scale with Crowdom.
You may want to study the image classification example first, because it contains more detailed comments on the overall process.
# Install the Crowdom package (notebook magic; run once per environment).
%pip install crowdom
from datetime import timedelta
import os
import pandas as pd
import json
from typing import Dict
import toloka.client as toloka
from crowdom import base, datasource, classification, classification_loop, client, control, mos, objects, pricing, params as labeling_params, worker
import yaml
import logging.config

# Configure logging from the YAML file shipped alongside this notebook.
with open('logging.yaml') as f:
    logging.config.dictConfig(yaml.full_load(f.read()))

# Create a Toloka client; the OAuth token is taken from the environment
# variable or requested interactively if the variable is not set.
toloka_client = client.create_toloka_client(token=os.getenv('TOLOKA_TOKEN') or input('Enter your token: '))
class MOS(base.ScoreEvaluation):
    """Five-point mean-opinion-score (MOS) scale for speech naturalness."""

    BAD = '1'
    POOR = '2'
    FAIR = '3'
    GOOD = '4'
    EXCELLENT = '5'

    @classmethod
    def labels(cls) -> Dict['MOS', Dict[str, str]]:
        """Return the human-readable label for each score, keyed by UI language."""
        # (score, English label, Russian label) — ordered best-to-worst,
        # which is the display order of the answer options.
        rows = (
            (cls.EXCELLENT, 'Completely natural', 'Абсолютно естественно'),
            (cls.GOOD, 'Mostly natural', 'В основном естественно'),
            (cls.FAIR, 'Equally natural and unnatural', 'В одинаковой степени естественно и неестественно'),
            (cls.POOR, 'Mostly unnatural', 'В основном неестественно'),
            (cls.BAD, 'Completely unnatural', 'Абсолютно неестественно'),
        )
        return {score: {'EN': en, 'RU': ru} for score, en, ru in rows}
# UI language for this labeling run.
lang = 'EN'
# The task function: classify a single audio into one of the MOS classes.
function = base.ClassificationFunction(inputs=(objects.Audio,), cls=MOS)
example_url = 'https://storage.yandexcloud.net/crowdom-public/examples/mos/data/00fa8c05-e960-4088-bdc8-7e37b5416e9b.wav'
# Display a link that previews how a single task will look to workers.
client.TaskPreview((objects.Audio(url=example_url),), task_function=function, lang=lang).display_link()
from markdown2 import Markdown as _Markdown
If your markdown instruction includes some complex objects, e.g. tables, there might be a need for some text processing so that it is compatible with the Toloka instruction format.
class Markdown(_Markdown):
    """Markdown converter whose HTML output is compatible with Toloka instructions."""

    def postprocess(self, text: str) -> str:
        # Strip the trailing ';' inside inline text-align styles produced by
        # the table extra, since Toloka does not accept it.
        result = text
        for alignment in ('left', 'right', 'center'):
            needle = f'"text-align:{alignment};"'
            result = result.replace(needle, f'"text-align:{alignment}"')
        return result
def _render_instruction(language: str) -> str:
    # Convert one per-language markdown instruction file to Toloka-compatible HTML.
    with open(f'instruction_{language}.md') as source:
        return Markdown(extras=["tables"]).convert(source.read())

instruction = {worker_lang: _render_instruction(worker_lang) for worker_lang in ('EN', 'RU')}
name = {'EN': 'Speech quality evaluation', 'RU': 'Оценка качества речи'}
description = {
'EN': 'Rate the quality of audio files on a scale of 1 to 5 (90 seconds to complete)',
'RU': 'Оцените качество аудиозаписи по шкале от 1 до 5 (можно выполнить за 90 секунд)',
}
task_spec = base.TaskSpec(
id='mos',
function=function,
name=name,
description=description,
instruction=instruction,
)
task_spec_en = client.PreparedTaskSpec(task_spec, lang)
client.define_task(task_spec_en, toloka_client)
task_duration_hint = timedelta(seconds=9) # audios are about 7-9 seconds each
In the simplest case, MOS labeling is run on audios from the same source.
Alternatively, MOS audio labeling can be run for multiple data sources — we can compare different synthesis models with each other, or a synthesis model with its source-data speaker.
When audios from multiple sources are labeled simultaneously, each of the data sources must have the same set of texts being spoken on its audios. Also, you have to provide a dict with metadata to enable the algorithms to distinguish audios from different sources from each other and to determine the text spoken on each audio.
def _read_source(path: str, algorithm: str, metadata: dict) -> list:
    """Read task audios from `path` and record, for each one, the producing
    algorithm and the spoken text (`item_id`), which lets the MOS pipeline
    match audios of the same text across sources.

    Returns the list of input objects read from `path`.
    """
    source_objects = datasource.read_tasks(path, task_spec_en.task_mapping)
    with open(path) as file:
        # The JSON entries and the parsed task objects are positionally aligned.
        for entry, task_input in zip(json.load(file), source_objects):
            metadata[task_input] = mos.ObjectsMetadata(item_id=entry['text'], algorithm=algorithm)
    return source_objects

# The two sources were loaded by duplicated copy-paste loops before; the helper
# also removes the loop variable that shadowed the aggregate `input_objects`.
objects_metadata = {}
speaker_objects = _read_source('speaker.json', 'speaker', objects_metadata)
synthesis_objects = _read_source('synthesis.json', 'synthesis', objects_metadata)
input_objects = speaker_objects + synthesis_objects
For MOS, we define the labeling parameters in code, because the interactive parameters form is unaware of some MOS pipeline specifics.
# MOS pools carry no control tasks (control_tasks_count=0); quality is handled
# by the speed-control rule below instead.
pricing_config = pricing.PoolPricingConfig(
    assignment_price=0.02,
    real_tasks_count=10,
    control_tasks_count=0,
)
params = client.Params(
    task_duration_hint=task_duration_hint,
    pricing_config=pricing_config,
    overlap=classification_loop.StaticOverlap(3),  # each audio is rated 3 times
    aggregation_algorithm=classification.AggregationAlgorithm.MAJORITY_VOTE,
    control=control.Control(
        # Static reward plus a speed check; BlockTimePicker(0.1, '2d', True)
        # presumably blocks for 2 days workers below a 0.1 speed fraction —
        # TODO confirm exact semantics against crowdom.control docs.
        rules=control.RuleBuilder().add_static_reward(0.5).add_complex_speed_control(
            [control.BlockTimePicker(0.1, '2d', True)]).build(),
    ),
    worker_filter=worker.WorkerFilter(
        filters=[
            worker.WorkerFilter.Params(
                langs={worker.LanguageRequirement(lang=lang)},
                # Default regions for the chosen language, if any are defined.
                regions=worker.lang_to_default_regions.get(lang, {}),
                age_range=(18, None),  # adults only, no upper bound
            ),
        ],
        training_score=None,  # no training requirement for this task
    ),
)
# Launch the MOS labeling. With interactive=True, a confirmation (including the
# computed price) is requested before the labeling actually starts.
artifacts = client.launch_mos(
    task_spec_en,
    params,
    input_objects,
    toloka_client,
    interactive=True,
    inputs_to_metadata=objects_metadata,
)
clear formula, which does not account for edge cases like the minimum commission and incomplete assignments
more precise formula, which accounts for more edge cases
run classification of 10 objects for 0.05$? [Y/n] Y
Output()
2022-08-29 14:38:20,359 - crowdom.client.launch:_launch:187 - INFO: - classification has started
# Per-task labeling results.
results = artifacts.results
# Per-algorithm MOS estimates with confidence intervals (see output below).
artifacts.ci
{'speaker': MOSCI(mu=4.35, ci=0.67), 'synthesis': MOSCI(mu=4.21, ci=0.59)}