Georgian tokenizer and South Azerbaijani
Browse files
- data/Kartvelian.json +21 -3
- data/Turkic.json +40 -2
data/Kartvelian.json
CHANGED
|
@@ -29,16 +29,34 @@
|
|
| 29 |
"iso_3_code": "kat",
|
| 30 |
"children": [],
|
| 31 |
"family": "Kartvelian",
|
| 32 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
"node_i": "4608",
|
| 34 |
-
"native_tokenizers": [
|
|
|
|
|
|
|
| 35 |
"scripts": [
|
| 36 |
"Geor"
|
| 37 |
]
|
| 38 |
}
|
| 39 |
],
|
| 40 |
"family": "Kartvelian",
|
| 41 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
"node_i": "4606",
|
| 43 |
"native_tokenizers": [],
|
| 44 |
"scripts": []
|
|
|
|
| 29 |
"iso_3_code": "kat",
|
| 30 |
"children": [],
|
| 31 |
"family": "Kartvelian",
|
| 32 |
+
"tokenizers": {
|
| 33 |
+
"Geor": {
|
| 34 |
+
"full_object": "GeorgianTokenizer()",
|
| 35 |
+
"original_lang_name": "georgian",
|
| 36 |
+
"original_lang_code": "kat",
|
| 37 |
+
"script": "Geor",
|
| 38 |
+
"class_name": "GeorgianTokenizer"
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
"node_i": "4608",
|
| 42 |
+
"native_tokenizers": [
|
| 43 |
+
"Geor"
|
| 44 |
+
],
|
| 45 |
"scripts": [
|
| 46 |
"Geor"
|
| 47 |
]
|
| 48 |
}
|
| 49 |
],
|
| 50 |
"family": "Kartvelian",
|
| 51 |
+
"tokenizers": {
|
| 52 |
+
"Geor": {
|
| 53 |
+
"full_object": "GeorgianTokenizer()",
|
| 54 |
+
"original_lang_name": "georgian",
|
| 55 |
+
"original_lang_code": "kat",
|
| 56 |
+
"script": "Geor",
|
| 57 |
+
"class_name": "GeorgianTokenizer"
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
"node_i": "4606",
|
| 61 |
"native_tokenizers": [],
|
| 62 |
"scripts": []
|
data/Turkic.json
CHANGED
|
@@ -372,9 +372,19 @@
|
|
| 372 |
"iso_3_code": "azb",
|
| 373 |
"children": [],
|
| 374 |
"family": "Turkic",
|
| 375 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
"node_i": "10583",
|
| 377 |
-
"native_tokenizers": [
|
|
|
|
|
|
|
| 378 |
"scripts": [
|
| 379 |
"Arab"
|
| 380 |
]
|
|
@@ -407,6 +417,13 @@
|
|
| 407 |
],
|
| 408 |
"family": "Turkic",
|
| 409 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
"Latn": {
|
| 411 |
"full_object": "SpaCyTokenizer(\"az\")",
|
| 412 |
"original_lang_name": "azerbaijani",
|
|
@@ -525,6 +542,13 @@
|
|
| 525 |
"children": [],
|
| 526 |
"family": "Turkic",
|
| 527 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
"Latn": {
|
| 529 |
"full_object": "SpaCyTokenizer(\"tr\")",
|
| 530 |
"original_lang_name": "turkish",
|
|
@@ -544,6 +568,13 @@
|
|
| 544 |
],
|
| 545 |
"family": "Turkic",
|
| 546 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 547 |
"Latn": {
|
| 548 |
"full_object": "SpaCyTokenizer(\"tr\")",
|
| 549 |
"original_lang_name": "turkish",
|
|
@@ -559,6 +590,13 @@
|
|
| 559 |
],
|
| 560 |
"family": "Turkic",
|
| 561 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
"Latn": {
|
| 563 |
"full_object": "SpaCyTokenizer(\"tr\")",
|
| 564 |
"original_lang_name": "turkish",
|
|
|
|
| 372 |
"iso_3_code": "azb",
|
| 373 |
"children": [],
|
| 374 |
"family": "Turkic",
|
| 375 |
+
"tokenizers": {
|
| 376 |
+
"Arab": {
|
| 377 |
+
"full_object": "SpaCyTokenizer(\"fa\")",
|
| 378 |
+
"original_lang_name": "persian",
|
| 379 |
+
"original_lang_code": "azb",
|
| 380 |
+
"script": "Arab",
|
| 381 |
+
"class_name": "SpaCyTokenizer"
|
| 382 |
+
}
|
| 383 |
+
},
|
| 384 |
"node_i": "10583",
|
| 385 |
+
"native_tokenizers": [
|
| 386 |
+
"Arab"
|
| 387 |
+
],
|
| 388 |
"scripts": [
|
| 389 |
"Arab"
|
| 390 |
]
|
|
|
|
| 417 |
],
|
| 418 |
"family": "Turkic",
|
| 419 |
"tokenizers": {
|
| 420 |
+
"Arab": {
|
| 421 |
+
"full_object": "SpaCyTokenizer(\"fa\")",
|
| 422 |
+
"original_lang_name": "persian",
|
| 423 |
+
"original_lang_code": "azb",
|
| 424 |
+
"script": "Arab",
|
| 425 |
+
"class_name": "SpaCyTokenizer"
|
| 426 |
+
},
|
| 427 |
"Latn": {
|
| 428 |
"full_object": "SpaCyTokenizer(\"az\")",
|
| 429 |
"original_lang_name": "azerbaijani",
|
|
|
|
| 542 |
"children": [],
|
| 543 |
"family": "Turkic",
|
| 544 |
"tokenizers": {
|
| 545 |
+
"Arab": {
|
| 546 |
+
"full_object": "SpaCyTokenizer(\"fa\")",
|
| 547 |
+
"original_lang_name": "persian",
|
| 548 |
+
"original_lang_code": "azb",
|
| 549 |
+
"script": "Arab",
|
| 550 |
+
"class_name": "SpaCyTokenizer"
|
| 551 |
+
},
|
| 552 |
"Latn": {
|
| 553 |
"full_object": "SpaCyTokenizer(\"tr\")",
|
| 554 |
"original_lang_name": "turkish",
|
|
|
|
| 568 |
],
|
| 569 |
"family": "Turkic",
|
| 570 |
"tokenizers": {
|
| 571 |
+
"Arab": {
|
| 572 |
+
"full_object": "SpaCyTokenizer(\"fa\")",
|
| 573 |
+
"original_lang_name": "persian",
|
| 574 |
+
"original_lang_code": "azb",
|
| 575 |
+
"script": "Arab",
|
| 576 |
+
"class_name": "SpaCyTokenizer"
|
| 577 |
+
},
|
| 578 |
"Latn": {
|
| 579 |
"full_object": "SpaCyTokenizer(\"tr\")",
|
| 580 |
"original_lang_name": "turkish",
|
|
|
|
| 590 |
],
|
| 591 |
"family": "Turkic",
|
| 592 |
"tokenizers": {
|
| 593 |
+
"Arab": {
|
| 594 |
+
"full_object": "SpaCyTokenizer(\"fa\")",
|
| 595 |
+
"original_lang_name": "persian",
|
| 596 |
+
"original_lang_code": "azb",
|
| 597 |
+
"script": "Arab",
|
| 598 |
+
"class_name": "SpaCyTokenizer"
|
| 599 |
+
},
|
| 600 |
"Latn": {
|
| 601 |
"full_object": "SpaCyTokenizer(\"tr\")",
|
| 602 |
"original_lang_name": "turkish",
|