# -*- coding: utf-8 -*-

"""
cwtex 4.1
(cwtex41.py, tex2xtc41.py, cwbiblatex41.py, cwmkidx41.py)

Copyright (C) 2016 Tsong-Min Wu and Tsong-Huey Wu
  with support of ...

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, see <http://www.gnu.org/licenses/>
"""


import os, sys, logging, unicodedata, argparse
from cwfont_decode import convert

# Command-line interface: one positional input file and two optional
# switches that select the sort mode.
parser = argparse.ArgumentParser(description="tex2xtc processor version 4.1")
parser.add_argument("infile")

parser.add_argument("-s", "--strokes", action="store_true", help="sorted by strokes")
parser.add_argument("-y", "--zhuyin", action="store_true", help="sorted by zhuyin")

args = parser.parse_args()

# "infile" is a required positional argument, so argparse always supplies it.
inputfile = args.infile

# Default: inputfile has file extension .tex.
# os.path.splitext only inspects the last path component, so dots in
# directory names ("./doc", "../v1.2/doc") or inside the base name
# ("my.file.tex") are no longer mistaken for the extension separator.
if not os.path.splitext(inputfile)[1]:  # No file extension
    inputfile = inputfile + ".tex"

# The input is decoded as latin-1; the output is written as UTF-8 next to the
# input, with the extension replaced by ".xtc".
file = open(inputfile, encoding='latin-1')
outputfile = os.path.splitext(inputfile)[0] + ".xtc"
file_object = open(outputfile, 'w', encoding='utf-8')

logging.basicConfig(format='%(lineno)d %(message)s')

# Sort mode passed on to convert(): 0 = none, 1 = --strokes, 2 = --zhuyin.
# When both switches are given, --zhuyin wins because it is tested last.
sorton = 0
if args.strokes:
    sorton = 1

if args.zhuyin:
    sorton = 2

import re
# Patterns used by the conversion loop.  Each one is tried at the current
# scan position of a line.

# A glyph reference of the form {\<PREFIX><lowercase letters>Q\cH<digits>},
# e.g. {\MESaaQ\cH116}.  Group 1 captures the upper-case prefix.
chincode = re.compile(r'{\\(MES|ME|MS|M|BBES|BBE|BBS|BB|RES|RE|RS|R|FES|FE|FS|F|KES|KE|KS|K)[a-z]+Q\\cH[\d]+}')
# Same shape as chincode, but with the characters "ü", "þ" and "ý" standing
# where chincode has "{", "\" and "}".
verbcode = re.compile(r'üþ(MES|ME|MS|M|BBES|BBE|BBS|BB|RES|RE|RS|R|FES|FE|FS|F|KES|KE|KS|K)[a-z]+QþcH[\d]+ý')
# A backslash followed by "z" or "Z".  Because "z" is listed before "zZ", the
# "zZ" alternative is never the one selected by match(); the loop inspects the
# following character itself.
zz = re.compile(r'\\(z|Z|zZ)')
#fontfamily = re.compile(r'\\fontfamily  {cwM}\\fontseries  {')
# Fixed 33-character prefix "{\fontfamily  {cwM}\fontseries  {" (note the
# double spaces); the loop skips exactly that many characters after a match.
fontfamily = re.compile(r'{\\fontfamily  {cwM}\\fontseries  {')
# Start of any "\fontfamily  {cw..." selection.
fontfamilycw = re.compile(r'\\fontfamily  {cw')
# "\fontseries  {" -- the series number follows the opening brace.
fontseries = re.compile(r'\\fontseries  {')
# "\selectfont  \char " -- the character number follows.
selectfont = re.compile(r'\\selectfont  \\char ')
# A space immediately followed by a digit.
num = re.compile(r' [0-9]')
# "\hskip 0.0pt plus0.2pt minus0.1pt{".  The dots are unescaped, so each one
# matches any character, not only a literal ".".
hskip = re.compile(r'\\hskip 0.0pt plus0.2pt minus0.1pt{')
#idxentry = re.compile(r'\\indexentry{{\\')
# Start of an index entry: "\indexentry{".
idxentry = re.compile(r'\\indexentry{')
# Read the whole input file into memory, one list element per line.
lines = file.readlines()

# Main conversion pass.
#
# Every input line is scanned left to right.  At each position the branches
# below are tried in order:
#   1. a glyph reference ({\MESaaQ\cH116} or its "ü þ ý" variant),
#   2. a \z / \Z / \zZ command, which is dropped,
#   3. "{\fontfamily  {cwM}\fontseries  {n}\selectfont  \char c}" (only when
#      sorting by strokes),
#   4. any other "\fontfamily  {cw...}" selection,
#   5. the start of an \indexentry (only when sorting by strokes),
#   6. "!" or end of line while inside a Chinese index key,
#   7. any other character, which is copied through.
#
# Glyphs are translated with convert(font, number, mode), which returns a pair
# (code point, sort value).  NOTE(review): the exact meaning of the second
# value is defined in cwfont_decode -- here it is only mapped to a letter.
for linei, str1 in enumerate(lines):
    i = 0           # index of the character currently being examined
    chinese = 0     # 1 while inside an \indexentry whose key starts with "{\"
    idxstr = ""     # characters of the index key collected so far

    # ith char in current line
    linelength = len(str1)
    # "<" rather than "!=": if a branch skips past the end of the line the
    # scan must stop instead of indexing beyond the string.
    while i < linelength:
        p = 0       # characters to skip in addition to the current one
        glyph = chincode.match(str1, i) or verbcode.match(str1, i)
        if glyph:    # e.g. {\MESaaQ\cH116}
            # Take the fields from the regex match rather than from
            # hand-counted offsets, so every prefix the pattern accepts
            # (including the four-letter BBES) is handled.
            token = glyph.group(0)
            family_start = 2 + len(glyph.group(1))   # 2 = opening char + "\"
            q_pos = token.index('Q', family_start)
            fontchar = token[family_start:q_pos]     # the lower-case letters
            charno = token[q_pos + 4:-1]             # digits after "Q\cH"
            p = len(token) - 1                       # resume after the token

            utf8str, j012 = convert(fontchar, charno, sorton)
            if chinese == 0:
                file_object.write(chr(utf8str))
            else:   # index
                idxstr = idxstr + chr(utf8str)
                # Write the sort value as a letter ("A" for 1); values above
                # 26 get a "ZZ" prefix and start again at "A".
                if j012 > 26:
                    file_object.write("ZZ" + chr(j012 - 26 + 64))
                else:
                    file_object.write(chr(j012 + 64))

                file_object.write(str(utf8str))
        elif zz.match(str1, i):
            # Drop "\z" / "\Z", plus a directly following "Z".  The slice
            # (instead of str1[i+2]) is safe when the command ends the line.
            if str1[i + 2:i + 3] == 'Z':
                p = 2
            else:
                p = 1
        elif fontfamily.match(str1, i) and sorton == 1:
            i = i + 33   # length of "{\fontfamily  {cwM}\fontseries  {"
            close = str1.index('}', i)
            fontchar = chr(int(str1[i:close]) + 97)  # series 0 -> "a", 1 -> "b", ...
            i = close + 1
            if selectfont.match(str1, i):
                # Land on the final space of "\selectfont  \char ", so charno
                # starts with that space, as in the branch below.
                i = i + 18
                close = str1.index('}', i)
                charno = str1[i:close]
                i = close
                if str1[i + 1] == '\\':
                    i = i + 1
                    if hskip.match(str1, i):
                        # Stop on the character before the "{" that ends the
                        # \hskip text; the final "i + 1" then lands on it.
                        i = i + 32 #33
            # NOTE(review): when \selectfont does not follow, charno keeps the
            # value of an earlier glyph (or is undefined); unchanged from the
            # original behaviour.
            utf8str, j012 = convert(fontchar, charno, 1)
            if chinese == 0:
                file_object.write(chr(utf8str))
            else:
                idxstr = idxstr + chr(utf8str)
                if j012 > 26:
                    file_object.write("ZZ" + chr(j012 - 26 + 64))
                else:
                    file_object.write(chr(j012 + 64))

                file_object.write(str(utf8str))
                if str1[i] == '!' and chinese == 1:
                    file_object.write("0" + idxstr + str1[i])
                    file_object.write(chr(64102))
                    idxstr = ""
        elif fontfamilycw.match(str1, i): # 2016/05/11
            # Locate "\fontseries  {n}" and then "\selectfont  \char c}" later
            # on the same line.  search() finds the same position the original
            # character-by-character scan found, but a missing piece is now
            # reported instead of looping forever.
            series = fontseries.search(str1, i)
            if series is None:
                raise ValueError("line %d: \\fontfamily without \\fontseries"
                                 % (linei + 1))
            close = str1.index('}', series.end())
            fontchar = chr(int(str1[series.end():close]) + 97)

            select = selectfont.search(str1, close)
            if select is None:
                raise ValueError("line %d: \\fontseries without \\selectfont"
                                 % (linei + 1))
            number = num.search(str1, select.start())
            if number is None:
                raise ValueError("line %d: \\selectfont without a character number"
                                 % (linei + 1))
            # charno starts at the space matched by num.
            i = str1.index('}', number.start())
            charno = str1[number.start():i]

            utf8str, j012 = convert(fontchar, charno, 0)
            file_object.write(chr(utf8str))

        elif idxentry.match(str1, i) and sorton == 1:
            # Move to the "{" of "\indexentry{"; the final "i + 1" steps past it.
            i = i + 11
            if str1[i + 1] == '{' and str1[i + 2] == '\\':
                # The key starts with "{\": treat it as a Chinese key and mark
                # it with code point 64102 (U+FA66).
                chinese = 1
                file_object.write("\\indexentry{" + chr(64102))
            else:
                chinese = 0
                file_object.write("\\indexentry{")

        elif (str1[i] == '!' or str1[i] == '\n') and chinese == 1:
            # End of one level of a Chinese key: after the sort codes already
            # written, emit "0", the collected text and the delimiter.
            file_object.write("0" + idxstr + str1[i])
            if str1[i] == '!':
                file_object.write(chr(64102))
            idxstr = ""
        else:
            # Ordinary character: copy it through, or collect it as part of
            # the index key.
            if chinese == 0:
                file_object.write(str1[i])
            else:
                idxstr = idxstr + str1[i]  # index
        i = i + 1 + p

# Report the number of input lines processed.  len(lines) equals the last loop
# index plus one, and unlike the loop variable it is also defined when the
# input file is empty.
print('[' + str(len(lines)) + ']')

file.close()
file_object.close()


