import pytest
from lingua import (
ConfidenceValue,
Language,
LanguageDetectorBuilder
)
# Shared detector built from English and German only, with preloaded
# language models. It is created once at import time and reused by the tests.
detector_for_english_and_german = LanguageDetectorBuilder.from_languages(
    Language.ENGLISH,
    Language.GERMAN,
).with_preloaded_language_models().build()
# Shared detector built from all languages, with preloaded language models.
# It is created once at import time and reused by the multi-language tests.
detector_for_all_languages = (
    LanguageDetectorBuilder
    .from_all_languages()
    .with_preloaded_language_models()
    .build()
)
def test_detect_language():
    """The English/German detector reports German for a German sentence."""
    detected = detector_for_english_and_german.detect_language_of(
        "Sprachen sind großartig"
    )
    assert detected == Language.GERMAN
def test_detect_languages_in_parallel():
    """Batch detection returns one language per text, in input order."""
    texts = [
        "languages are awesome",
        "Sprachen sind großartig",
    ]
    detected = (
        detector_for_english_and_german.detect_languages_in_parallel_of(texts)
    )
    assert detected == [Language.ENGLISH, Language.GERMAN]
def test_no_language_is_returned():
    """Cyrillic input yields ``None`` from the English/German detector."""
    detected = detector_for_english_and_german.detect_language_of("проарплап")
    assert detected is None
def test_detect_multiple_languages_for_empty_string():
    """Multi-language detection on an empty string gives an empty list."""
    sections = detector_for_english_and_german.detect_multiple_languages_of("")
    assert sections == []
@pytest.mark.parametrize(
    ("sentence", "expected_word_count", "expected_language"),
    [
        pytest.param(
            "I'm really not sure whether multi-language detection is a good idea.",
            11,
            Language.ENGLISH,
            id="ENGLISH 1",
        ),
        pytest.param(
            "I'm frightened! 🙈",
            3,
            Language.ENGLISH,
            id="ENGLISH 2",
        ),
        pytest.param(
            "V төзімділік спорт",
            3,
            Language.KAZAKH,
            id="KAZAKH",
        ),
    ],
)
def test_detect_multiple_languages_with_one_language(
    sentence, expected_word_count, expected_language
):
    """A single-language sentence produces exactly one detection result.

    The one result must span the whole sentence and carry the expected
    word count and language.
    """
    results = detector_for_all_languages.detect_multiple_languages_of(sentence)
    assert len(results) == 1

    only_result = results[0]
    covered_text = sentence[only_result.start_index : only_result.end_index]
    assert covered_text == sentence
    assert only_result.word_count == expected_word_count
    assert only_result.language == expected_language
@pytest.mark.parametrize(
    (
        "sentence",
        "expected_first_substring",
        "expected_first_word_count",
        "expected_first_language",
        "expected_second_substring",
        "expected_second_word_count",
        "expected_second_language",
    ),
    [
        pytest.param(
            ' He turned around and asked: "Entschuldigen Sie, sprechen Sie Deutsch?"',
            " He turned around and asked: ",
            5,
            Language.ENGLISH,
            '"Entschuldigen Sie, sprechen Sie Deutsch?"',
            5,
            Language.GERMAN,
            id="ENGLISH,GERMAN",
        ),
        pytest.param(
            "上海大学是一个好大学. It is such a great university.",
            "上海大学是一个好大学. ",
            10,
            Language.CHINESE,
            "It is such a great university.",
            6,
            Language.ENGLISH,
            id="CHINESE,ENGLISH",
        ),
        pytest.param(
            "English German French - Английский язык",
            "English German French - ",
            4,
            Language.ENGLISH,
            "Английский язык",
            2,
            Language.RUSSIAN,
            id="ENGLISH,RUSSIAN",
        ),
    ],
)
def test_detect_multiple_languages_with_two_languages(
    sentence,
    expected_first_substring,
    expected_first_word_count,
    expected_first_language,
    expected_second_substring,
    expected_second_word_count,
    expected_second_language,
):
    """A two-language sentence is split into two consecutive sections.

    Each section is checked, in order, for the text it covers (via its
    start/end indices into ``sentence``), its word count and its language.
    """
    results = detector_for_all_languages.detect_multiple_languages_of(sentence)
    assert len(results) == 2

    # One (substring, word_count, language) triple per expected section.
    expected_sections = (
        (
            expected_first_substring,
            expected_first_word_count,
            expected_first_language,
        ),
        (
            expected_second_substring,
            expected_second_word_count,
            expected_second_language,
        ),
    )
    for position, expected in enumerate(expected_sections):
        substring, word_count, language = expected
        section = results[position]
        assert sentence[section.start_index : section.end_index] == substring
        assert section.word_count == word_count
        assert section.language == language
@pytest.mark.parametrize(
    (
        "sentence",
        "expected_first_substring",
        "expected_first_word_count",
        "expected_first_language",
        "expected_second_substring",
        "expected_second_word_count",
        "expected_second_language",
        "expected_third_substring",
        "expected_third_word_count",
        "expected_third_language",
    ),
    [
        pytest.param(
            "Parlez-vous français? Ich spreche Französisch nur ein bisschen. A little bit is better than nothing.",
            "Parlez-vous français? ",
            2,
            Language.FRENCH,
            "Ich spreche Französisch nur ein bisschen. ",
            6,
            Language.GERMAN,
            "A little bit is better than nothing.",
            7,
            Language.ENGLISH,
            id="FRENCH,GERMAN,ENGLISH",
        ),
        pytest.param(
            "Płaszczowo-rurowe wymienniki ciepła Uszczelkowe der blaue himmel über berlin 中文 the quick brown fox jumps over the lazy dog",
            "Płaszczowo-rurowe wymienniki ciepła Uszczelkowe ",
            4,
            Language.POLISH,
            "der blaue himmel über berlin 中文 ",
            7,
            Language.GERMAN,
            "the quick brown fox jumps over the lazy dog",
            9,
            Language.ENGLISH,
            id="POLISH,GERMAN,ENGLISH",
        ),
    ],
)
def test_detect_multiple_languages_with_three_languages(
    sentence,
    expected_first_substring,
    expected_first_word_count,
    expected_first_language,
    expected_second_substring,
    expected_second_word_count,
    expected_second_language,
    expected_third_substring,
    expected_third_word_count,
    expected_third_language,
):
    """A three-language sentence is split into three consecutive sections.

    Each section is checked, in order, for the text it covers (via its
    start/end indices into ``sentence``), its word count and its language.
    """
    results = detector_for_all_languages.detect_multiple_languages_of(sentence)
    assert len(results) == 3

    # One (substring, word_count, language) triple per expected section.
    expected_sections = (
        (
            expected_first_substring,
            expected_first_word_count,
            expected_first_language,
        ),
        (
            expected_second_substring,
            expected_second_word_count,
            expected_second_language,
        ),
        (
            expected_third_substring,
            expected_third_word_count,
            expected_third_language,
        ),
    )
    for position, expected in enumerate(expected_sections):
        substring, word_count, language = expected
        section = results[position]
        assert sentence[section.start_index : section.end_index] == substring
        assert section.word_count == word_count
        assert section.language == language
@pytest.mark.parametrize(
    ("text", "expected_confidence_values"),
    [
        pytest.param(
            "groß",
            [
                ConfidenceValue(Language.GERMAN, 1.0),
                ConfidenceValue(Language.ENGLISH, 0.0),
            ],
        ),
        pytest.param(
            "Alter",
            [
                ConfidenceValue(Language.GERMAN, 0.68),
                ConfidenceValue(Language.ENGLISH, 0.32),
            ],
        ),
        pytest.param(
            "проарплап",
            [
                ConfidenceValue(Language.ENGLISH, 0.0),
                ConfidenceValue(Language.GERMAN, 0.0),
            ],
        ),
    ],
)
def test_compute_language_confidence_values(text, expected_confidence_values):
    """Confidence values come back as two entries in the expected order.

    Each entry is compared by language and by its value rounded to two
    decimal places.
    """
    confidence_values = (
        detector_for_english_and_german.compute_language_confidence_values(text)
    )
    assert len(confidence_values) == 2

    # The length check above guarantees zip() pairs both entries.
    for actual, expected in zip(confidence_values, expected_confidence_values):
        assert actual.language == expected.language
        assert round(actual.value, 2) == expected.value
@pytest.mark.parametrize(
    "texts,expected_confidence_values",
    [
        pytest.param(
            ["groß", "Alter", "проарплап"],
            [
                [
                    ConfidenceValue(Language.GERMAN, 1.0),
                    ConfidenceValue(Language.ENGLISH, 0.0),
                ],
                [
                    ConfidenceValue(Language.GERMAN, 0.68),
                    ConfidenceValue(Language.ENGLISH, 0.32),
                ],
                [
                    ConfidenceValue(Language.ENGLISH, 0.0),
                    ConfidenceValue(Language.GERMAN, 0.0),
                ],
            ],
        ),
    ],
)
def test_compute_language_confidence_values_in_parallel(
    texts, expected_confidence_values
):
    """Batch confidence computation returns one list of values per text.

    For every input text the returned values are compared, in order, with
    the expected ones by language and by value rounded to two decimals.

    Expected sizes are derived from the parameters rather than hard-coded,
    so further parametrized cases of any size can be added without touching
    the test body.
    """
    confidence_values = (
        detector_for_english_and_german
        .compute_language_confidence_values_in_parallel(texts)
    )
    # Check the lengths first: zip() would silently truncate on a mismatch.
    assert (
        len(confidence_values) == len(texts) == len(expected_confidence_values)
    )

    for values, expected_values in zip(
        confidence_values, expected_confidence_values
    ):
        assert len(values) == len(expected_values)
        for actual, expected in zip(values, expected_values):
            assert actual.language == expected.language
            assert round(actual.value, 2) == expected.value
@pytest.mark.parametrize(
    (
        "text",
        "expected_confidence_for_german",
        "expected_confidence_for_english",
    ),
    [
        pytest.param("groß", 1.0, 0.0),
        pytest.param("Alter", 0.68, 0.32),
        pytest.param("проарплап", 0.0, 0.0),
    ],
)
def test_compute_language_confidence(
    text,
    expected_confidence_for_german,
    expected_confidence_for_english,
):
    """Per-language confidence matches the expected value for each text.

    German and English confidences are compared after rounding to two
    decimals. French is not among the languages the detector was built
    with, and the test expects exactly 0.0 for it (no rounding applied).
    """

    def confidence_for(language):
        # Single place for the repeated detector call.
        return detector_for_english_and_german.compute_language_confidence(
            text, language
        )

    german_confidence = confidence_for(Language.GERMAN)
    assert round(german_confidence, 2) == expected_confidence_for_german

    english_confidence = confidence_for(Language.ENGLISH)
    assert round(english_confidence, 2) == expected_confidence_for_english

    french_confidence = confidence_for(Language.FRENCH)
    assert french_confidence == 0.0
@pytest.mark.parametrize(
    (
        "texts",
        "expected_confidence_values_for_german",
        "expected_confidence_values_for_english",
    ),
    [
        pytest.param(
            ["groß", "Alter", "проарплап"],
            [1.0, 0.68, 0.0],
            [0.0, 0.32, 0.0],
        ),
    ],
)
def test_compute_language_confidence_in_parallel(
    texts,
    expected_confidence_values_for_german,
    expected_confidence_values_for_english
):
    """Batch per-language confidence matches the expected lists.

    The values returned for German and for English are each rounded to two
    decimals and compared with the expected list as a whole.
    """
    german_values = (
        detector_for_english_and_german
        .compute_language_confidence_in_parallel(texts, Language.GERMAN)
    )
    rounded_german_values = [round(value, 2) for value in german_values]
    assert rounded_german_values == expected_confidence_values_for_german

    english_values = (
        detector_for_english_and_german
        .compute_language_confidence_in_parallel(texts, Language.ENGLISH)
    )
    rounded_english_values = [round(value, 2) for value in english_values]
    assert rounded_english_values == expected_confidence_values_for_english