tags:

views:

46

answers:

1

I'm trying to convert old text written with an ASCII-based, non-English font into proper Unicode text, so the keys have to be mapped. I have two options. The first is a map file like this one, sample.map (a plain text file):

w=ം
x=ഃ
A=അ
B=ആ
C=ഇ
Cu=ഈ
D=ഉ
Du=ഊ
E=ഋ
\p=ഌ
F=എ
G=ഏ
sF=ഐ
H=ഒ
Hm=ഓ
Hu=ഔ
I=ക
J=ഖ

The code has to replace every character sequence on the left side with the one on the right side. How do I loop through each character and replace it using the information from the map file? I have been trying find-and-replace techniques, but without success. How can I make PHP read this particular map file (a text file with a .map extension), loop through each character and replace it without destroying the document?

Here is complete map file

I also found a Python script that does this, but I couldn't port it to PHP. I am very weak in Python, so I'm pasting the code here:

import sys 
import codecs 
import os 
from optparse import OptionParser 

class Payyan:
    """Convert text between a legacy ASCII-font encoding and Unicode.

    The conversion table is read from ``mapping_filename``, a UTF-8 text
    file with one ``ascii=unicode`` rule per line.  Lines starting with
    ``#`` and blank lines are ignored.

    Attributes set by the caller before converting:
        input_filename:   file to read; empty means standard input.
        output_filename:  file to write; empty means standard output.
        mapping_filename: path of the rule file described above.
        pdf:              truthy if the input is a PDF that must first be
                          run through the external ``pdftotext`` tool.

    Attributes maintained by the class:
        rulesDict: the currently loaded lookup table (or None).
        direction: "a2u" (ASCII to Unicode) or "u2a" (Unicode to ASCII).
    """

    # All code points are written as escapes so the class does not depend
    # on the encoding the source file happens to be saved in.
    _VOWEL_E = u"\u0d0e"         # MALAYALAM LETTER E
    _VOWEL_AI = u"\u0d10"        # MALAYALAM LETTER AI
    _VOWEL_O = u"\u0d12"         # MALAYALAM LETTER O
    _VOWEL_OO = u"\u0d13"        # MALAYALAM LETTER OO
    _VOWEL_AU = u"\u0d14"        # MALAYALAM LETTER AU
    _SIGN_AA = u"\u0d3e"         # MALAYALAM VOWEL SIGN AA
    _SIGN_E = u"\u0d46"          # MALAYALAM VOWEL SIGN E
    _SIGN_EE = u"\u0d47"         # MALAYALAM VOWEL SIGN EE
    _SIGN_AI = u"\u0d48"         # MALAYALAM VOWEL SIGN AI
    _SIGN_O = u"\u0d4a"          # MALAYALAM VOWEL SIGN O
    _SIGN_OO = u"\u0d4b"         # MALAYALAM VOWEL SIGN OO
    _SIGN_AU = u"\u0d4c"         # MALAYALAM VOWEL SIGN AU
    _AU_LENGTH_MARK = u"\u0d57"  # MALAYALAM AU LENGTH MARK
    _SIGN_RA = u"\u0d4d\u0d30"   # VIRAMA + RA
    _SIGN_YA = u"\u0d4d\u0d2f"   # VIRAMA + YA
    _SIGN_VA = u"\u0d4d\u0d35"   # VIRAMA + VA

    # Signs written as one part before and one part after the base letter.
    _SPLIT_SIGNS = (_SIGN_OO, _SIGN_O, _SIGN_AU)
    # Signs written entirely before the base letter.
    _PREBASE_ONLY_SIGNS = (_SIGN_E, _SIGN_EE, _SIGN_RA)
    # Everything isPrebase() recognises.
    _PREBASE_SIGNS = (_SIGN_EE, _SIGN_AI, _SIGN_O, _SIGN_OO, _SIGN_AU,
                      _SIGN_RA, _SIGN_E)
    # Everything isPostbase() recognises.
    _POSTBASE_SIGNS = (_SIGN_YA, _SIGN_VA)

    def __init__(self):
        self.input_filename = ""
        self.output_filename = ""
        self.mapping_filename = ""
        self.rulesDict = None
        self.pdf = 0
        # The rule file is written ASCII -> Unicode, so that is the default.
        self.direction = "a2u"
        # (table, direction) of the last table LoadRules() read from disk.
        self._loaded_rules = None

    def word2ASCII(self, unicode_text):
        """Return ``unicode_text`` converted to the legacy ASCII encoding.

        At each position the longest rule of 3, 2 or 1 characters wins
        (this takes care of conjuncts).  Characters without a rule are
        copied through unchanged.
        """
        # TODO: make the prebase handling generic so it is usable for all
        # Indian languages.
        self.direction = "u2a"
        self.rulesDict = self.LoadRules()
        rules = self.rulesDict
        ascii_text = ""
        index = 0
        while index < len(unicode_text):
            for charNo in (3, 2, 1):
                letter = unicode_text[index:index + charNo]
                if letter not in rules:
                    continue
                ascii_letter = rules[letter]
                # Prebase signs have to move in front of the character that
                # was emitted last (the base letter).
                head, base = ascii_text[:-1], ascii_text[-1:]
                if letter == self._SIGN_AI:
                    # The mapped glyph is emitted twice before the base;
                    # presumably the font writes AI as a doubled E sign --
                    # confirm against the map file.
                    ascii_text = head + ascii_letter * 2 + base
                elif letter in self._SPLIT_SIGNS:
                    # First mapped character goes before the base, the rest
                    # after it.  Slicing (not indexing) avoids an IndexError
                    # when the rule maps to a single character.
                    ascii_text = (head + ascii_letter[:1] + base
                                  + ascii_letter[1:])
                elif letter in self._PREBASE_ONLY_SIGNS:
                    ascii_text = head + ascii_letter + base
                else:
                    ascii_text = ascii_text + ascii_letter
                index = index + charNo
                break
            else:
                # No rule matched at any length: copy one character as is.
                ascii_text = ascii_text + unicode_text[index]
                index = index + 1
        return ascii_text

    def Uni2Ascii(self):
        """Convert the input from Unicode to ASCII line by line.

        Reads ``input_filename`` (or stdin) and writes ``output_filename``
        (or stdout).  Returns 0 on completion.
        """
        return self._convert_stream(self.word2ASCII)

    def word2Unicode(self, ascii_text):
        """Return ``ascii_text`` converted from the legacy encoding to Unicode.

        At each position the longest rule of 2 or 1 characters wins.
        A prebase sign is held back and emitted after the next mapped
        letter; a postbase sign directly following a letter is consumed
        together with it.  Characters without a rule are copied through.
        """
        self.direction = "a2u"
        self.rulesDict = self.LoadRules()
        rules = self.rulesDict
        unicode_text = ""
        prebase_letter = ""
        index = 0
        while index < len(ascii_text):
            for charNo in (2, 1):
                letter = ascii_text[index:index + charNo]
                if letter not in rules:
                    continue
                unicode_letter = rules[letter]
                if self.isPrebase(unicode_letter):
                    # Remember it until the base letter shows up.
                    prebase_letter = unicode_letter
                else:
                    postbase_letter = ""
                    post_index = index + charNo
                    if post_index < len(ascii_text):
                        follower = rules.get(ascii_text[post_index])
                        if follower is not None and self.isPostbase(follower):
                            postbase_letter = follower
                            # Skip the single character just consumed.
                            index = index + 1
                    if unicode_letter in (self._VOWEL_E, self._VOWEL_O):
                        # Independent vowel plus held-back sign may merge
                        # into a single vowel letter.  The original passed
                        # the two arguments in swapped order, so the merge
                        # rules could never apply.
                        unicode_text = (unicode_text + postbase_letter
                                        + self.getVowelSign(unicode_letter,
                                                            prebase_letter))
                    else:
                        unicode_text = (unicode_text + unicode_letter
                                        + postbase_letter + prebase_letter)
                    prebase_letter = ""
                index = index + charNo
                break
            else:
                # No rule matched at either length: copy one character.
                unicode_text = unicode_text + ascii_text[index]
                index = index + 1
        # NOTE(review): a prebase sign still pending here (no mapped letter
        # followed it) is dropped, exactly as in the original code.
        return unicode_text

    def Ascii2Uni(self):
        """Convert the input from ASCII to Unicode line by line.

        If ``pdf`` is set, ``pdftotext`` is run on ``input_filename`` first
        and the resulting ``.txt`` file becomes the input.

        Returns 0 on completion, 1 if ``pdftotext`` could not be run or
        reported failure.
        """
        if self.pdf:
            import subprocess
            try:
                # Argument list, no shell: file names containing quotes or
                # other shell metacharacters are passed through safely.
                status = subprocess.call(["pdftotext", self.input_filename])
            except OSError:
                status = 1  # pdftotext is not installed / not executable
            if status:
                print("The input file is a PDF file. To convert this the  pdftotext  utility is required. ")
                print("This feature is available only for GNU/Linux Operating system.")
                return 1  # Error - no pdftotext !
            self.input_filename = os.path.splitext(self.input_filename)[0] + ".txt"
        return self._convert_stream(self.word2Unicode)

    def getVowelSign(self, vowel_letter, vowel_sign_letter):
        """Combine an independent vowel with a vowel sign.

        Returns the single merged vowel letter for the three known pairs
        (E + sign E -> AI, O + sign AA -> OO, O + AU length mark -> AU);
        for anything else returns ``vowel_letter + vowel_sign_letter``.
        """
        if vowel_letter == self._VOWEL_E:
            if vowel_sign_letter == self._SIGN_E:
                return self._VOWEL_AI
        if vowel_letter == self._VOWEL_O:
            if vowel_sign_letter == self._SIGN_AA:
                return self._VOWEL_OO
            if vowel_sign_letter == self._AU_LENGTH_MARK:
                return self._VOWEL_AU
        return vowel_letter + vowel_sign_letter

    def isPrebase(self, letter):
        """Return True if ``letter`` is one of the prebase signs."""
        return letter in self._PREBASE_SIGNS

    def isPostbase(self, letter):
        """Return True if ``letter`` is one of the postbase signs."""
        return letter in self._POSTBASE_SIGNS

    def LoadRules(self):
        """Return the lookup table for the current ``direction``.

        For "a2u" the table maps left-hand side to right-hand side of each
        rule; for any other direction it maps right to left.  A table read
        from disk is reused only while the direction is unchanged; a table
        assigned to ``rulesDict`` by the caller is always used as is.

        Raises:
            ValueError: a non-comment, non-blank line does not contain
                exactly one ``=``.
        """
        direction = getattr(self, "direction", "a2u")
        current = getattr(self, "rulesDict", None)
        loaded = getattr(self, "_loaded_rules", None)
        if current:
            injected = loaded is None or loaded[0] is not current
            if injected or loaded[1] == direction:
                return current

        rules_dict = {}
        # errors='ignore' silently drops undecodable bytes (best effort,
        # as in the original).
        rules_file = codecs.open(self.mapping_filename, encoding='utf-8',
                                 errors='ignore')
        try:
            # Keep the line number; it is required for error reporting.
            for line_number, text in enumerate(rules_file, 1):
                # Ignore the comments (only '#' in the first column counts).
                if text[0] == '#':
                    continue
                line = text.strip()
                if line == "":
                    continue
                parts = line.split("=")
                if len(parts) != 2:
                    raise ValueError(
                        "Error: Syntax Error in the Ascii to Unicode Map "
                        "in line number %d\nLine: %s" % (line_number, text))
                lhs, rhs = parts
                if direction == 'a2u':
                    rules_dict[lhs] = rhs
                else:
                    rules_dict[rhs] = lhs
        finally:
            rules_file.close()
        self._loaded_rules = (rules_dict, direction)
        return rules_dict

    def _open_input(self):
        """Return ``(stream, owned)``: a UTF-8 reader and whether to close it."""
        if self.input_filename:
            stream = codecs.open(self.input_filename, encoding='utf-8',
                                 errors='ignore')
            return stream, True
        # codecs.open() needs a file name, so stdin is wrapped in a decoding
        # reader instead.  Python 3 exposes the byte stream as .buffer.
        raw_in = getattr(sys.stdin, "buffer", sys.stdin)
        return codecs.getreader('utf-8')(raw_in, errors='ignore'), False

    def _convert_stream(self, convert_line):
        """Run ``convert_line`` over every input line and write the result."""
        source, close_source = self._open_input()
        target = None
        try:
            if self.output_filename:
                target = codecs.open(self.output_filename, encoding='utf-8',
                                     errors='ignore', mode='w')
            raw_out = getattr(sys.stdout, "buffer", sys.stdout)
            for text in source:
                converted = convert_line(text)
                if target is not None:
                    target.write(converted)
                else:
                    # Lines keep their own newline, so nothing is appended
                    # (print added a second one).
                    raw_out.write(converted.encode('utf-8'))
            if target is None:
                raw_out.flush()
        finally:
            if target is not None:
                target.close()
            if close_source:
                source.close()
        return 0

UPDATE: I think I was wrong about the ASCII part. It's text written in a non-English font, which I want to convert to Unicode so that it is properly shown on

A: 

ASCII is really just 7-bit. A guess is that you are dealing possibly with ISO-8859-1, to be converted to UTF-8 or other unicode encoding. Iconv can be used:

http://php.net/manual/en/function.iconv.php

動靜能量
Maybe what I mean is text written in a text editor with that particular font.
esafwan