`
chentianliang
  • 浏览: 15001 次
  • 性别: Icon_minigender_1
  • 来自: 上海
社区版块
存档分类
最新评论

简体中文转换为繁体中文

    博客分类:
  • java
阅读更多

/**
 * Numeric identifiers for the supported Chinese text encodings, together with
 * three lookup tables that give each identifier a name.
 *
 * <p>Every identifier doubles as an index into {@link #javaname},
 * {@link #nicename} and {@link #htmlname}; {@link #TOTALTYPES} is the number of
 * identifiers and therefore the length of each table.
 *
 * <p>The identifiers are compile-time constants, so they cannot be reassigned
 * and may be used as {@code switch} case labels. The table references are
 * final as well; the array elements themselves remain writable.
 */
public class Encoding {

    // Supported Encoding Types
    public static final int GB2312 = 0;
    public static final int GBK = 1;
    public static final int HZ = 2;
    public static final int BIG5 = 3;
    public static final int CNS11643 = 4;
    public static final int UTF8 = 5;
    public static final int UNICODE = 6;
    public static final int UNICODET = 7;
    public static final int UNICODES = 8;
    public static final int ISO2022CN = 9;
    public static final int ISO2022CN_CNS = 10;
    public static final int ISO2022CN_GB = 11;
    public static final int ASCII = 12;
    public static final int OTHER = 13;

    /** Number of encoding identifiers; also the length of each name table. */
    public static final int TOTALTYPES = 14;

    /** Names of the encodings as understood by Java. */
    public static final String[] javaname = new String[TOTALTYPES];

    /** Names of the encodings for human viewing. */
    public static final String[] nicename = new String[TOTALTYPES];

    /** Names of charsets as used in the charset parameter of an HTML meta tag. */
    public static final String[] htmlname = new String[TOTALTYPES];

    static {
        // Assign Java encoding names
        javaname[GB2312] = "GB2312";
        javaname[HZ] = "ASCII"; // What to put here? Sun doesn't support HZ
        javaname[GBK] = "GBK";
        javaname[ISO2022CN_GB] = "ISO2022CN_GB";
        javaname[BIG5] = "BIG5";
        javaname[CNS11643] = "EUC-TW";
        javaname[ISO2022CN_CNS] = "ISO2022CN_CNS";
        javaname[ISO2022CN] = "ISO2022CN";
        javaname[UTF8] = "UTF8";
        javaname[UNICODE] = "Unicode";
        javaname[UNICODET] = "Unicode";
        javaname[UNICODES] = "Unicode";
        javaname[ASCII] = "ASCII";
        javaname[OTHER] = "ISO8859_1";

        // Assign HTML charset names
        htmlname[GB2312] = "GB2312";
        htmlname[HZ] = "HZ-GB-2312";
        // NOTE(review): GBK is advertised as "GB2312" here; kept as in the
        // original table - confirm whether that is intentional.
        htmlname[GBK] = "GB2312";
        htmlname[ISO2022CN_GB] = "ISO-2022-CN-EXT";
        htmlname[BIG5] = "BIG5";
        htmlname[CNS11643] = "EUC-TW";
        htmlname[ISO2022CN_CNS] = "ISO-2022-CN-EXT";
        htmlname[ISO2022CN] = "ISO-2022-CN";
        htmlname[UTF8] = "UTF-8";
        htmlname[UNICODE] = "UTF-16";
        htmlname[UNICODET] = "UTF-16";
        htmlname[UNICODES] = "UTF-16";
        htmlname[ASCII] = "ASCII";
        htmlname[OTHER] = "ISO8859-1";

        // Assign human readable names
        nicename[GB2312] = "GB-2312";
        nicename[HZ] = "HZ";
        nicename[GBK] = "GBK";
        nicename[ISO2022CN_GB] = "ISO2022CN-GB";
        nicename[BIG5] = "Big5";
        nicename[CNS11643] = "CNS11643";
        nicename[ISO2022CN_CNS] = "ISO2022CN-CNS";
        nicename[ISO2022CN] = "ISO2022 CN";
        nicename[UTF8] = "UTF-8";
        nicename[UNICODE] = "Unicode";
        nicename[UNICODET] = "Unicode (Trad)";
        nicename[UNICODES] = "Unicode (Simp)";
        nicename[ASCII] = "ASCII";
        nicename[OTHER] = "OTHER";
    }

}

 

 

 

==========================================================================

public class EncodingTranslate extends Encoding {

 // Simplfied/Traditional character equivalence hashes

 static protected Hashtable<String, String> s2thash, t2shash;

 

 static {

  String dataline;

 

  // Initialize and load in the simplified/traditional character hashses

  s2thash = new Hashtable<String, String>();

  t2shash = new Hashtable<String, String>();

 

  try {

   InputStream pydata = EncodingTranslate.class

     .getResourceAsStream("hcutf8.txt");

   BufferedReader in = new BufferedReader(new InputStreamReader(

     pydata, "UTF8"));

   while ((dataline = in.readLine()) != null) {

    // Skip empty and commented lines

    if (dataline.length() == 0 || dataline.charAt(0) == '#') {

     continue;

    }

 

    // Simplified to Traditional, (one to many, but pick only one)

    s2thash.put(dataline.substring(0, 1).intern(), dataline

      .substring(1, 2));

 

    // Traditional to Simplified, (many to one)

    for (int i = 1; i < dataline.length(); i++) {

     t2shash.put(dataline.substring(i, i + 1).intern(), dataline

       .substring(0, 1));

    }

   }

  } catch (Exception e) {

   System.err.println(e);

  }

 

 }

 

 public static String convertString(String dataline, int source_encoding,

   int target_encoding) {

  StringBuffer outline = new StringBuffer();

  int lineindex;

 

  if (source_encoding == HZ) {

   dataline = hz2gb(dataline);

  }

  for (lineindex = 0; lineindex < dataline.length(); lineindex++) {

   if ((source_encoding == GB2312 || source_encoding == GBK

     || source_encoding == ISO2022CN_GB || source_encoding == HZ

     || source_encoding == UNICODE

     || source_encoding == UNICODES || source_encoding == UTF8)

     && (target_encoding == BIG5 || target_encoding == CNS11643

       || target_encoding == UNICODET || target_encoding == ISO2022CN_CNS)) {

    if (s2thash.containsKey(dataline.substring(lineindex,

      lineindex + 1)) == true) {

     outline.append(s2thash.get(dataline.substring(lineindex,

       lineindex + 1).intern()));

    } else {

     outline

       .append(dataline

         .substring(lineindex, lineindex + 1));

    }

   } else if ((source_encoding == BIG5 || source_encoding == CNS11643

     || source_encoding == UNICODET || source_encoding == UTF8

     || source_encoding == ISO2022CN_CNS

     || source_encoding == GBK || source_encoding == UNICODE)

     && (target_encoding == GB2312

       || target_encoding == UNICODES

       || target_encoding == ISO2022CN_GB || target_encoding == HZ)) {

    if (t2shash.containsKey(dataline.substring(lineindex,

      lineindex + 1)) == true) {

     outline.append(t2shash.get(dataline.substring(lineindex,

       lineindex + 1).intern()));

    } else {

     outline

       .append(dataline

         .substring(lineindex, lineindex + 1));

    }

   } else {

    outline.append(dataline.substring(lineindex, lineindex + 1));

   }

  }

 

  if (target_encoding == HZ) {

   // Convert to look like HZ

   return gb2hz(outline.toString());

  }

 

  return outline.toString();

 }

 

 public static String hz2gb(String hzstring) {

  byte[] hzbytes = new byte[2];

  byte[] gbchar = new byte[2];

  int byteindex = 0;

  StringBuffer gbstring = new StringBuffer("");

 

  try {

   hzbytes = hzstring.getBytes("8859_1");

  } catch (Exception usee) {

   System.err.println("Exception " + usee.toString());

   return hzstring;

  }

 

  // Convert to look like equivalent Unicode of GB

  for (byteindex = 0; byteindex < hzbytes.length; byteindex++) {

   if (hzbytes[byteindex] == 0x7e) {

    if (hzbytes[byteindex + 1] == 0x7b) {

     byteindex += 2;

     while (byteindex < hzbytes.length) {

      if (hzbytes[byteindex] == 0x7e

        && hzbytes[byteindex + 1] == 0x7d) {

       byteindex++;

       break;

      } else if (hzbytes[byteindex] == 0x0a

        || hzbytes[byteindex] == 0x0d) {

       gbstring.append((char) hzbytes[byteindex]);

       break;

      }

      gbchar[0] = (byte) (hzbytes[byteindex] + 0x80);

      gbchar[1] = (byte) (hzbytes[byteindex + 1] + 0x80);

      try {

       gbstring.append(new String(gbchar, "GB2312"));

      } catch (Exception usee) {

       System.err.println("Exception " + usee.toString());

      }

      byteindex += 2;

     }

    } else if (hzbytes[byteindex + 1] == 0x7e) { // ~~ becomes ~

     gbstring.append('~');

    } else { // false alarm

     gbstring.append((char) hzbytes[byteindex]);

    }

   } else {

    gbstring.append((char) hzbytes[byteindex]);

   }

  }

  return gbstring.toString();

 }

 

 public static String gb2hz(String gbstring) {

  StringBuffer hzbuffer;

  byte[] gbbytes = new byte[2];

  int i;

  boolean terminated = false;

 

  hzbuffer = new StringBuffer("");

  try {

   gbbytes = gbstring.getBytes("GB2312");

  } catch (Exception usee) {

   System.err.println(usee.toString());

   return gbstring;

  }

 

  for (i = 0; i < gbbytes.length; i++) {

   if (gbbytes[i] < 0) {

    hzbuffer.append("~{");

    terminated = false;

    while (i < gbbytes.length) {

     if (gbbytes[i] == 0x0a || gbbytes[i] == 0x0d) {

      hzbuffer.append("~}" + (char) gbbytes[i]);

      terminated = true;

      break;

     } else if (gbbytes[i] >= 0) {

      hzbuffer.append("~}" + (char) gbbytes[i]);

      terminated = true;

      break;

     }

     hzbuffer.append((char) (gbbytes[i] + 256 - 0x80));

     hzbuffer.append((char) (gbbytes[i + 1] + 256 - 0x80));

     i += 2;

    }

    if (terminated == false) {

     hzbuffer.append("~}");

    }

   } else {

    if (gbbytes[i] == 0x7e) {

     hzbuffer.append("~~");

    } else {

     hzbuffer.append((char) gbbytes[i]);

    }

   }

  }

  return new String(hzbuffer);

 }

 

 public static void convertFile(String sourcefile, String outfile,

   int source_encoding, int target_encoding) {

  BufferedReader srcbuffer;

  BufferedWriter outbuffer;

  String dataline;

 

  try {

   srcbuffer = new BufferedReader(new InputStreamReader(

     new FileInputStream(sourcefile), javaname[source_encoding]));

   outbuffer = new BufferedWriter(new OutputStreamWriter(

     new FileOutputStream(outfile), javaname[target_encoding]));

   while ((dataline = srcbuffer.readLine()) != null) {

    outbuffer.write(convertString(dataline, source_encoding,

      target_encoding));

    outbuffer.newLine();

   }

   srcbuffer.close();

   outbuffer.close();

  } catch (Exception ex) {

   System.err.println(ex);

  }

 }

 

 public static void main(String argc[]) {

  int codetypes[];

  char codetype;

 

  // Determine source and target encodings, store in codetypes

  codetypes = new int[2];

  argc[0] = argc[0].toLowerCase();

  for (int i = 0; i < 2; i++) {

   codetype = argc[0].charAt(i + 1);

   // Print Help

   if (codetype == 'h') {

    System.exit(0);

   }

 

   if (codetype == 'g') {

    codetypes[i] = GB2312;

   } else if (codetype == 'h') {

    codetypes[i] = HZ;

   } else if (codetype == 'b') {

    codetypes[i] = BIG5;

   } else if (codetype == 'c') {

    codetypes[i] = CNS11643;

   } else if (codetype == '8') {

    codetypes[i] = UTF8;

   } else if (codetype == 'u') {

    codetypes[i] = UNICODE;

   } else if (codetype == 't') {

    codetypes[i] = UNICODET;

   } else if (codetype == 's') {

    codetypes[i] = UNICODES;

   } else if (codetype == 'i') {

    codetypes[i] = ISO2022CN;

   } else if (codetype == '2') {

    codetypes[i] = ISO2022CN_GB;

   } else if (codetype == 'n') {

    codetypes[i] = ISO2022CN_CNS;

   } else if (codetype == 'k') {

    codetypes[i] = GBK;

   }

  }

 

  // Call the file convert function with appropriate arguments

  EncodingTranslate.convertFile(argc[1], argc[2], codetypes[0], codetypes[1]);

 }

 


}

 

 

 

//测试方法 注意附件的文件请放在同一目录下

 

public static void main(String[] args) {
           system.out.println(EncodingTranslate.convertString("测试数据安定发生地发生法撒旦法",Encoding.GBK,Encoding.BIG5)); //转化成繁体中文
}

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics