package defpackage;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.lang.Character;
import java.util.LinkedList;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;

/* loaded from: input_file:segmenter.class */
/**
 * Dictionary-driven word segmenter for Chinese text.
 *
 * <p>A line is first cut into runs: CJK unified ideographs (matched greedily against the
 * loaded lexicon, longest entry first), whitespace, letters, digits, and single other
 * characters. Adjacent runs are then merged when the combined text consists only of
 * characters from the foreign-transliteration set, and afterwards when it consists only of
 * characters from the number set.
 *
 * <p>The character sets are read from classpath resources under {@code data/} and the
 * lexicon from {@code simplexu8.txt}, {@code tradlexu8.txt} or {@code bothlexu8.txt}, all
 * resolved relative to this class and decoded as UTF-8. A resource that cannot be found is
 * reported on {@code System.err} and simply contributes no entries.
 */
public class segmenter {
    /** Character-set selector: traditional characters. */
    public static final int TRAD = 0;
    /** Character-set selector: simplified characters. */
    public static final int SIMP = 1;
    /** Character-set selector: traditional and simplified characters together. */
    public static final int BOTH = 2;

    /** Longest lexicon entry, in {@code char}s, that the greedy matcher tries. */
    private static final int MAX_WORD_LENGTH = 8;

    /** How many lexicon entries are loaded between two debug progress reports. */
    private static final int PROGRESS_INTERVAL = 20000;

    /** Single-character prefixes that {@link #stemWord(String)} may strip. */
    private static final String[] STEM_PREFIXES = {"第", "副", "不"};

    /** Single-character suffixes that {@link #stemWord(String)} may strip. */
    private static final String[] STEM_SUFFIXES = {
        "了", "的", "地", "下", "上", "中", "里", "到", "内", "外", "们"
    };

    /** Middle characters that {@link #stemWord(String)} may drop from a three-character word. */
    private static final String[] STEM_INFIXES = {"得", "不"};

    /** The lexicon. Only the keys matter; every value is the string {@code "1"}. */
    private TreeMap<String, String> zhwords;
    /** Enables diagnostic output; nothing in this class ever switches it on. */
    private boolean debug = false;
    /** Charset used by {@link #printDebug(String)}. */
    private String debugencoding = "UTF-8";
    /** Surname characters; loaded by the constructor but not consulted in this class. */
    private TreeSet<String> csurname = new TreeSet<String>();
    /** Characters used for foreign transliterations. */
    private TreeSet<String> cforeign = new TreeSet<String>();
    /** Characters that make up numbers. */
    private TreeSet<String> cnumbers = new TreeSet<String>();
    /** "Not a name" characters; loaded by the constructor but not consulted in this class. */
    private TreeSet<String> cnotname = new TreeSet<String>();

    /**
     * Creates a segmenter and loads its character sets and, optionally, its lexicon.
     *
     * @param i which data files to load: {@link #TRAD}, {@link #SIMP} or {@link #BOTH};
     *          any other value is treated as {@link #BOTH}
     * @param z {@code true} to load the lexicon; {@code false} to start with an empty one
     *          that can be filled through {@link #addword(String)}
     */
    public segmenter(int i, boolean z) {
        if (i == SIMP) {
            loadCharacterSets("s");
        } else if (i == TRAD) {
            loadCharacterSets("t");
        } else {
            loadCharacterSets("s");
            loadCharacterSets("t");
        }
        this.zhwords = new TreeMap<String, String>();
        if (!z) {
            return;
        }
        String lexicon;
        if (i == SIMP) {
            lexicon = "simplexu8.txt";
        } else if (i == TRAD) {
            lexicon = "tradlexu8.txt";
        } else {
            // Mirrors the character-set choice above: unknown selectors mean "both".
            lexicon = "bothlexu8.txt";
        }
        loadLexicon(lexicon);
    }

    /** Loads the four character-set files whose names start with {@code prefix} ("s" or "t"). */
    private void loadCharacterSets(String prefix) {
        loadset(this.cnumbers, "data/" + prefix + "numbers_u8.txt");
        loadset(this.cforeign, "data/" + prefix + "foreign_u8.txt");
        loadset(this.csurname, "data/" + prefix + "surname_u8.txt");
        loadset(this.cnotname, "data/" + prefix + "notname_u8.txt");
    }

    /**
     * Reads a lexicon resource, one word per line, into the dictionary. Empty lines and
     * lines containing {@code #} are skipped.
     */
    private void loadLexicon(String resourceName) {
        InputStream in = getClass().getResourceAsStream(resourceName);
        if (in == null) {
            // getResourceAsStream signals "not found" with null rather than an exception.
            System.err.println("IOException: lexicon resource " + resourceName + " not found");
            return;
        }
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(in, "UTF8"));
            int loaded = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.length() == 0 || line.indexOf("#") != -1) {
                    continue;
                }
                addword(line);
                // The counter only advances in debug mode, exactly like the progress output.
                if (this.debug && loaded++ % PROGRESS_INTERVAL == 0) {
                    System.err.println(loaded);
                }
            }
        } catch (IOException e) {
            System.err.println("IOException: " + e);
        } finally {
            closeResource(reader != null ? (Closeable) reader : in);
        }
    }

    /**
     * Adds every non-empty line of a UTF-8 resource that does not contain {@code #} to a set.
     *
     * @param treeSet destination set
     * @param str resource name, resolved relative to this class
     */
    private void loadset(TreeSet<String> treeSet, String str) {
        InputStream in = getClass().getResourceAsStream(str);
        if (in == null) {
            System.err.println("Data file " + str + " not found on the classpath");
            return;
        }
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.length() != 0 && line.indexOf("#") == -1) {
                    treeSet.add(line);
                }
            }
        } catch (IOException e) {
            System.err.println("Exception loading data file" + str + " " + e);
        } finally {
            closeResource(reader != null ? (Closeable) reader : in);
        }
    }

    /** Closes a resource if it is non-null, reporting (not propagating) a failure. */
    private static void closeResource(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            System.err.println("IOException: " + e);
        }
    }

    /** Returns whether every single-{@code char} substring of {@code text} is in {@code allowed}. */
    private static boolean containsOnly(TreeSet<String> allowed, String text) {
        for (int pos = 0; pos < text.length(); pos++) {
            if (!allowed.contains(text.substring(pos, pos + 1))) {
                return false;
            }
        }
        return true;
    }

    /** Returns whether {@code c} lies in the CJK Unified Ideographs block. */
    private static boolean isCjkIdeograph(char c) {
        return Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS;
    }

    /**
     * Tells whether a string is made up entirely of number characters.
     *
     * @param str text to test
     * @return {@code true} if every character is in the loaded number set (vacuously
     *         {@code true} for an empty string)
     */
    public boolean isNumber(String str) {
        boolean numeric = containsOnly(this.cnumbers, str);
        if (this.debug) {
            printDebug(str + " " + numeric);
        }
        return numeric;
    }

    /**
     * Tells whether a string is made up entirely of foreign-transliteration characters.
     *
     * @param str text to test
     * @return {@code true} if every character is in the loaded transliteration set
     *         (vacuously {@code true} for an empty string)
     */
    public boolean isAllForeign(String str) {
        return containsOnly(this.cforeign, str);
    }

    /**
     * Tells whether a string is free of CJK unified ideographs.
     *
     * @param str text to test
     * @return {@code false} as soon as one character lies in the CJK Unified Ideographs
     *         block, {@code true} otherwise
     */
    public boolean isNotCJK(String str) {
        for (int pos = 0; pos < str.length(); pos++) {
            if (isCjkIdeograph(str.charAt(pos))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Strips at most one affix from a word.
     *
     * <p>Tried in order: a known prefix, a known suffix, then a known middle character of a
     * three-character word. A prefix or suffix is removed when the remainder is a lexicon
     * entry or the word is exactly two characters long; a middle character is removed when
     * the first and last character together form a lexicon entry.
     *
     * @param str word to stem
     * @return the stemmed word, or {@code str} unchanged when no rule applies (including
     *         when {@code str} is empty)
     */
    public String stemWord(String str) {
        int length = str.length();
        if (length == 0) {
            // Nothing to strip; the affix checks below need at least one character.
            return str;
        }
        for (String prefix : STEM_PREFIXES) {
            if (str.startsWith(prefix)) {
                String rest = str.substring(1);
                if (length == 2 || this.zhwords.containsKey(rest)) {
                    return rest;
                }
            }
        }
        for (String suffix : STEM_SUFFIXES) {
            if (str.endsWith(suffix)) {
                String stem = str.substring(0, length - 1);
                if (length == 2 || this.zhwords.containsKey(stem)) {
                    if (this.debug) {
                        System.out.println("Stemmed suffix");
                        printDebug(str);
                    }
                    return stem;
                }
            }
        }
        if (length == 3) {
            String outer = str.substring(0, 1) + str.substring(2, 3);
            for (String infix : STEM_INFIXES) {
                if (str.substring(1, 2).equals(infix) && this.zhwords.containsKey(outer)) {
                    if (this.debug) {
                        System.out.println("Stemmed infix");
                    }
                    return outer;
                }
            }
        }
        return str;
    }

    /**
     * Segments a line and returns it with a separator inserted between segments.
     *
     * <p>No separator is added after the last segment, after a segment that itself starts
     * with the separator, or where the text following the segment already starts with it.
     *
     * @param str line to segment
     * @param str2 separator to insert; may be longer than one character
     * @return the line with separators inserted
     */
    public String segmentLine(String str, String str2) {
        int[] offsets = segmentLineOffsets(str);
        if (offsets.length == 0) {
            return str;
        }
        StringBuilder result = new StringBuilder(str);
        // Walk backwards so earlier insert positions stay valid. The final index can be
        // skipped because a segment starting there necessarily ends the line.
        for (int start = offsets.length - 2; start >= 0; start--) {
            int segmentLength = offsets[start];
            if (segmentLength <= 0) {
                continue;
            }
            int end = start + segmentLength;
            // startsWith(prefix, offset) is bounds-safe, unlike substring, so separators
            // longer than the remaining text cannot raise an exception.
            if (end != str.length() && !str.startsWith(str2, start) && !str.startsWith(str2, end)) {
                result.insert(end, str2);
            }
        }
        return result.toString();
    }

    /**
     * Segments a line and returns where its segments begin.
     *
     * @param str line to segment
     * @return the start index of every segment, in ascending order
     */
    public LinkedList<Integer> segmentLine(String str) {
        int[] offsets = segmentLineOffsets(str);
        LinkedList<Integer> starts = new LinkedList<Integer>();
        for (int index = 0; index < offsets.length; index++) {
            if (offsets[index] > 0) {
                starts.add(Integer.valueOf(index));
            }
        }
        return starts;
    }

    /**
     * Computes the segmentation of a line.
     *
     * @param str line to segment
     * @return an array as long as {@code str}: the entry at a segment's first index holds
     *         that segment's length, every other entry is {@code 0}
     */
    public int[] segmentLineOffsets(String str) {
        int length = str.length();
        int[] offsets = new int[length];
        if (this.debug) {
            System.out.println("Line length " + length);
            System.out.println("Grouping Chinese, letters, digits and spaces");
        }
        int pos = 0;
        while (pos < length) {
            if (this.debug) {
                System.out.println("i " + pos);
            }
            char first = str.charAt(pos);
            int run;
            if (isCjkIdeograph(first)) {
                run = longestWordAt(str, pos);
            } else if (Character.isWhitespace(first)) {
                run = 1;
                while (pos + run < length && Character.isWhitespace(str.charAt(pos + run))) {
                    run++;
                }
            } else if (Character.isLetter(first)) {
                run = 1;
                // Character.isLetter is also true for ideographs; stop there so that
                // Chinese text following a Latin word is still segmented.
                while (pos + run < length
                        && Character.isLetter(str.charAt(pos + run))
                        && !isCjkIdeograph(str.charAt(pos + run))) {
                    run++;
                }
            } else if (Character.isDigit(first)) {
                run = 1;
                while (pos + run < length && Character.isDigit(str.charAt(pos + run))) {
                    run++;
                }
            } else {
                run = 1;
            }
            offsets[pos] = run;
            pos += run;
        }
        if (this.debug) {
            System.out.println("Grouping foreign transliterations");
        }
        mergeAdjacentSegments(str, offsets, false);
        if (this.debug) {
            System.out.println("Grouping numbers");
        }
        mergeAdjacentSegments(str, offsets, true);
        return offsets;
    }

    /**
     * Returns the length of the longest lexicon entry starting at {@code start}, trying at
     * most {@link #MAX_WORD_LENGTH} characters, or {@code 1} if no entry matches.
     */
    private int longestWordAt(String str, int start) {
        int candidate = Math.min(MAX_WORD_LENGTH, str.length() - start);
        while (candidate > 1 && !this.zhwords.containsKey(str.substring(start, start + candidate))) {
            candidate--;
        }
        return candidate;
    }

    /**
     * Repeatedly joins a segment with its right neighbour while the joined text passes
     * {@link #isNumber(String)} (when {@code numbers} is true) or
     * {@link #isAllForeign(String)} (otherwise). A swallowed neighbour's entry is zeroed.
     */
    private void mergeAdjacentSegments(String str, int[] offsets, boolean numbers) {
        int length = offsets.length;
        for (int start = 0; start < length; start++) {
            if (offsets[start] <= 0) {
                continue;
            }
            int next = start + offsets[start];
            // "next < length" keeps the array access valid; the joined segment may end
            // exactly at the end of the line.
            while (next < length && offsets[next] > 0) {
                int joinedEnd = next + offsets[next];
                String joined = str.substring(start, joinedEnd);
                boolean mergeable = numbers ? isNumber(joined) : isAllForeign(joined);
                if (!mergeable) {
                    break;
                }
                offsets[next] = 0;
                offsets[start] = joinedEnd - start;
                next = joinedEnd;
            }
        }
    }

    /**
     * Adds a word to the lexicon.
     *
     * @param str word to add
     */
    public void addword(String str) {
        this.zhwords.put(str, "1");
    }

    /**
     * Segments a text file line by line, separating segments with a single space, and
     * writes the result next to the input as {@code <input>.seg}.
     *
     * <p>Failures are reported on {@code System.err} and do not propagate.
     *
     * @param str path of the input file
     * @param str2 charset name used for both reading and writing
     */
    public void segmentFile(String str, String str2) {
        String outputPath = str + ".seg";
        FileInputStream rawIn = null;
        FileOutputStream rawOut = null;
        BufferedReader reader = null;
        BufferedWriter writer = null;
        try {
            rawIn = new FileInputStream(str);
            reader = new BufferedReader(new InputStreamReader(rawIn, str2));
            rawOut = new FileOutputStream(outputPath);
            writer = new BufferedWriter(new OutputStreamWriter(rawOut, str2));
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(segmentLine(line, " "));
                writer.newLine();
            }
        } catch (Exception e) {
            // Best effort per file: report and let the caller move on to the next one.
            System.err.println("Exception " + e.toString());
        } finally {
            // Fall back to the raw stream when wrapping it failed (e.g. unknown charset).
            closeResource(writer != null ? (Closeable) writer : rawOut);
            closeResource(reader != null ? (Closeable) reader : rawIn);
        }
    }

    /**
     * Writes a line to standard output encoded in the debug charset.
     *
     * @param str text to print
     */
    public void printDebug(String str) {
        try {
            // Emit the encoded bytes directly; decoding them again with the platform
            // charset would corrupt the text wherever that charset differs.
            byte[] encoded = str.getBytes(this.debugencoding);
            System.out.write(encoded, 0, encoded.length);
            System.out.println();
        } catch (IOException e) {
            System.err.println("Exception " + e);
        }
    }

    /** Prints the command-line usage text to standard output. */
    private static void printUsage() {
        System.out.println("Usage:\njava -jar segmenter.jar [-b|-g|-8|-s|-t] inputfile.txt");
        System.out.println("\t-b Big5, -g GB2312, -8 UTF-8, -s simp. chars, -t trad. chars");
        System.out.println("  Segmented text will be saved to inputfile.txt.seg");
    }

    /** Prints the command-line usage text and terminates the JVM with status 0. */
    public static void printHelp() {
        printUsage();
        System.exit(0);
    }

    /**
     * Command-line entry point.
     *
     * <p>Options: {@code -b} Big5/traditional (the default), {@code -g} GBK/simplified,
     * {@code -8} UTF-8/both, {@code -s} UTF-8/simplified, {@code -t} UTF-8/traditional,
     * {@code -d} verbose option echo, {@code -h} help. Every other argument is a file or
     * directory to segment; directories are expanded to their entries. Exits with status 1
     * when no input is named.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) {
        Vector<String> paths = new Vector<String>();
        String encoding = "BIG5";
        int charset = TRAD;
        boolean verbose = false;
        for (String arg : strArr) {
            if (arg.equals("-b")) {
                if (verbose) {
                    System.out.println("Setting to Big5, TRAD");
                }
                encoding = "BIG5";
                charset = TRAD;
            } else if (arg.equals("-g")) {
                if (verbose) {
                    System.out.println("Setting to GB, SIMP");
                }
                encoding = "GBK";
                charset = SIMP;
            } else if (arg.equals("-8")) {
                encoding = "UTF8";
                charset = BOTH;
            } else if (arg.equals("-s")) {
                if (verbose) {
                    System.out.println("Setting to UTF-8 SIMP");
                }
                encoding = "UTF8";
                charset = SIMP;
            } else if (arg.equals("-t")) {
                if (verbose) {
                    System.out.println("Setting to UTF-8 TRAD");
                }
                encoding = "UTF8";
                charset = TRAD;
            } else if (arg.equals("-h")) {
                printHelp();
            } else if (arg.equals("-d")) {
                verbose = true;
            } else {
                paths.add(arg);
            }
        }
        if (paths.isEmpty()) {
            System.out.println("ERROR: Please specify name of Chinese text file to segment.\n");
            printUsage();
            // A usage error is a failure, so do not report success to the shell.
            System.exit(1);
        }
        System.err.println("Loading segmenter word list.  One moment please.");
        segmenter worker = new segmenter(charset, true);
        System.err.println("Total keys " + worker.zhwords.size());
        // Indexed loop on purpose: directory entries are appended while iterating.
        for (int index = 0; index < paths.size(); index++) {
            String path = paths.get(index);
            File file = new File(path);
            if (!file.exists()) {
                System.out.println("ERROR: Source file " + path + " does not exist.\n");
            } else if (file.isDirectory()) {
                String[] children = file.list();
                if (children != null) {
                    for (String child : children) {
                        paths.add(path + File.separator + child);
                    }
                }
            } else {
                System.err.println("Segmenting " + path + " with encoding " + encoding);
                worker.segmentFile(path, encoding);
            }
        }
    }
}
