bilibili-backup/library/text/translate/chinese/opencc.go
2019-04-22 02:59:20 +00:00

182 lines
4.0 KiB
Go

package chinese
import (
"context"
"encoding/json"
"fmt"
"strings"
"go-common/library/log"
)
var (
// defaultConversion is the key into conversions used by Convert and
// Converts, and by convert when it is called with an empty conversion name.
defaultConversion = "s2twp"
)
// Group holds the dictionaries loaded for one entry of a conversion chain.
// Files and Dicts are always appended together, so both slices have the same
// length and order.
type Group struct {
// Files lists the dictionary file names in the order they were loaded.
Files []string
// Dicts holds the dictionaries built from Files, in the same order.
Dicts []*dict
}
// String renders the group as the formatted list of its dictionary file names.
func (g *Group) String() string {
	files := g.Files
	return fmt.Sprintf("%+v", files)
}
// openCC describes one conversion: its raw JSON configuration and the
// dictionary groups loaded from that configuration.
type openCC struct {
// Conversion is the JSON configuration text that dict parses.
Conversion string
// Description is filled by dict from the configuration's "name" field.
Description string
// DictGroup holds the groups built by dict from the "conversion_chain"
// entries; convert applies them in slice order.
DictGroup []*Group
}
// conversions maps a conversion name to its configuration. Only "s2twp" is
// enabled; the other entries are kept below as commented-out code.
var conversions = map[string]*openCC{
"s2twp": {Conversion: s2twp},
// "hk2s": {Conversion: hk2s}, "s2hk": {Conversion: s2hk}, "s2t": {Conversion: s2t},
// "s2tw": {Conversion: s2tw}, "t2hk": {Conversion: t2hk},
// "t2s": {Conversion: t2s}, "t2tw": {Conversion: t2tw},
// "tw2s": {Conversion: tw2s}, "tw2sp": {Conversion: tw2sp},
}
// Init loads the dictionaries of every entry in conversions.
// It panics with the load error if any conversion fails to load.
func Init() {
	for name, cc := range conversions {
		err := cc.dict(name)
		if err == nil {
			continue
		}
		panic(err)
	}
}
// Converts converts every string of in with the default conversion and
// returns the results keyed by the original string.
//
// A string that fails to convert is logged and mapped to itself, so every
// input always has an entry in the returned map. ctx is currently unused.
func Converts(ctx context.Context, in ...string) (out map[string]string) {
	out = make(map[string]string, len(in))
	for _, v := range in {
		converted, err := convert(v, defaultConversion)
		if err != nil {
			// Log the element that failed, not the whole input slice.
			log.Error("convert(%s),err:%+v", v, err)
			converted = v
		}
		out[v] = converted
	}
	return out
}
// Convert converts in from Simplified Chinese to Traditional Chinese using
// the default conversion.
//
// When the conversion fails, the error is logged and in is returned
// unchanged, which is the same fallback Converts applies, so the caller never
// loses its text. ctx is currently unused.
func Convert(ctx context.Context, in string) (out string) {
	converted, err := convert(in, defaultConversion)
	if err != nil {
		log.Error("convert(%s),err:%+v", in, err)
		return in
	}
	return converted
}
// dict parses the JSON configuration held in cc.Conversion and loads the
// dictionary group of every "conversion_chain" entry.
//
// On success cc.Description is set from the "name" field and cc.DictGroup is
// replaced by the freshly loaded groups, so calling dict again does not
// duplicate the chain. On failure cc is left untouched and an error is
// returned; malformed configurations are reported as errors instead of
// panicking. conversion is only used to label error messages.
func (cc *openCC) dict(conversion string) error {
	var config map[string]interface{}
	if err := json.Unmarshal([]byte(cc.Conversion), &config); err != nil {
		return fmt.Errorf("parse config of conversion %q: %v", conversion, err)
	}
	name, ok := config["name"].(string)
	if !ok {
		return fmt.Errorf("conversion %q: name should be string", conversion)
	}
	chain, ok := config["conversion_chain"].([]interface{})
	if !ok {
		return fmt.Errorf("format %+v not correct", config)
	}

	groups := make([]*Group, 0, len(chain))
	for _, item := range chain {
		step, ok := item.(map[string]interface{})
		if !ok {
			return fmt.Errorf("should be map inside conversion_chain")
		}
		rawDict, ok := step["dict"]
		if !ok {
			return fmt.Errorf("should have dict inside conversion_chain")
		}
		spec, ok := rawDict.(map[string]interface{})
		if !ok {
			// A "dict" value that is not an object is skipped, as before.
			continue
		}
		group, err := cc.group(spec)
		if err != nil {
			return err
		}
		groups = append(groups, group)
	}

	cc.Description = name
	cc.DictGroup = groups
	return nil
}
// group builds a Group from one dictionary description d.
//
// d["type"] selects the shape:
//   - "txt":   d["file"] names a dictionary file that is loaded with
//     buildFromFile; the result holds that single file and dict.
//   - "group": d["dicts"] is a list of nested descriptions; each is resolved
//     recursively and its files and dicts are appended, in order, to one
//     flat Group.
//
// Any other type, or a field with an unexpected JSON type, yields an error.
func (cc *openCC) group(d map[string]interface{}) (*Group, error) {
	typ, ok := d["type"].(string)
	if !ok {
		return nil, fmt.Errorf("type should be string")
	}
	res := &Group{}
	switch typ {
	case "group":
		items, ok := d["dicts"].([]interface{})
		if !ok {
			return nil, fmt.Errorf("dicts field invalid")
		}
		for _, item := range items {
			child, ok := item.(map[string]interface{})
			if !ok {
				return nil, fmt.Errorf("dicts items invalid")
			}
			sub, err := cc.group(child)
			if err != nil {
				return nil, err
			}
			res.Files = append(res.Files, sub.Files...)
			res.Dicts = append(res.Dicts, sub.Dicts...)
		}
	case "txt":
		raw, ok := d["file"]
		if !ok {
			return nil, fmt.Errorf("no file field found")
		}
		// Checked assertion: a non-string "file" is a config error, not a panic.
		file, ok := raw.(string)
		if !ok {
			return nil, fmt.Errorf("file should be string, got %T", raw)
		}
		loaded, err := buildFromFile(file)
		if err != nil {
			return nil, err
		}
		res.Files = append(res.Files, file)
		res.Dicts = append(res.Dicts, loaded)
	default:
		return nil, fmt.Errorf("type should be txt or group")
	}
	return res, nil
}
// convert rewrites in with the named conversion; an empty name selects
// defaultConversion.
//
// The dictionary groups of the conversion are applied one after another, each
// to the output of the previous one. Within a group the text is scanned rune
// by rune: at every position the group's dicts are asked, in order, for
// prefix matches of the remaining text. The first dict that reports any match
// decides the position: its longest matching key is replaced by the first
// candidate of that key. When nothing matches, the current rune is copied
// unchanged.
//
// It returns an error for an unknown conversion name or when a dictionary
// lookup fails; in both cases the returned string is empty.
func convert(in, conversion string) (string, error) {
	if conversion == "" {
		conversion = defaultConversion
	}
	cc, ok := conversions[conversion]
	if !ok || cc == nil {
		return "", fmt.Errorf("conversion %q not supported", conversion)
	}
	for _, group := range cc.DictGroup {
		runes := []rune(in)
		var b strings.Builder
		for i := 0; i < len(runes); {
			// The remaining text is the same for every dict, so build it once.
			tail := string(runes[i:])
			var token, matched string
			for _, d := range group.Dicts {
				ret, err := d.prefixMatch(tail)
				if err != nil {
					return "", err
				}
				if len(ret) == 0 {
					continue
				}
				for k, v := range ret {
					if len(k) > len(matched) {
						matched, token = k, v[0]
					}
				}
				// Only the first dict with matches is consulted.
				break
			}
			if matched == "" { // no match: keep the rune as is
				token = string(runes[i])
				i++
			} else {
				i += len([]rune(matched))
			}
			b.WriteString(token)
		}
		in = b.String()
	}
	return in, nil
}