001/** 002 * Portions Copyright 2001 Sun Microsystems, Inc. 003 * Portions Copyright 1999-2001 Language Technologies Institute, 004 * Carnegie Mellon University. 005 * All Rights Reserved. Use is subject to license terms. 006 * 007 * See the file "license.terms" for information on usage and 008 * redistribution of this file, and for a DISCLAIMER OF ALL 009 * WARRANTIES. 010 */ 011package com.sun.speech.freetts; 012 013import java.io.BufferedReader; 014import java.io.IOException; 015import java.io.InputStreamReader; 016import java.net.URL; 017import java.util.HashMap; 018import java.util.Map; 019import java.util.NoSuchElementException; 020import java.util.StringTokenizer; 021 022/** 023 * Implementation of a <code>PartOfSpeech</code> that reads the info 024 * from a file. The format of the file is as follows: 025 * 026 * <pre> 027 * word pos 028 * word pos 029 * word pos 030 * ... 031 * </pre> 032 * 033 * Where <code>word</code> is the word and <code>pos</code> is the 034 * part of speech for the word. The part of speech is implementation 035 * dependent. 036 */ 037public class PartOfSpeechImpl implements PartOfSpeech { 038 /** 039 * Used for informational purposes if there's a bad line in the 040 * file. 041 */ 042 private int lineCount = 0; 043 044 /** 045 * A map from words to their part of speech. 046 */ 047 private Map partOfSpeechMap; 048 049 /** 050 * Default part of speech. 051 */ 052 private String defaultPartOfSpeech; 053 054 /** 055 * Creates a new PartOfSpeechImpl by reading from the given URL. 056 * 057 * @param url the input source 058 * @param defaultPartOfSpeech the default part of speech 059 * 060 * @throws IOException if an error occurs 061 */ 062 public PartOfSpeechImpl(URL url, String defaultPartOfSpeech) 063 throws IOException { 064 065 BufferedReader reader; 066 String line; 067 068 partOfSpeechMap = new HashMap(); 069 this.defaultPartOfSpeech = defaultPartOfSpeech; 070 reader = new BufferedReader(new 071 InputStreamReader(url.openStream())); 072 line = reader.readLine(); 073 lineCount++; 074 while (line != null) { 075 if (!line.startsWith("***")) { 076 parseAndAdd(line); 077 } 078 line = reader.readLine(); 079 } 080 reader.close(); 081 } 082 083 /** 084 * Creates a word from the given input line and adds it to the map. 085 * 086 * @param line the input line 087 */ 088 private void parseAndAdd(String line) { 089 StringTokenizer tokenizer = new StringTokenizer(line," "); 090 try { 091 String word = tokenizer.nextToken(); 092 String pos = tokenizer.nextToken(); 093 partOfSpeechMap.put(word, pos); 094 } catch (NoSuchElementException nse) { 095 System.err.println("part of speech data in bad format at line " 096 + lineCount); 097 } 098 } 099 100 /** 101 * Returns a description of the part of speech given a word. 102 * If the given word cannot be found, the part of speech will be the 103 * <code>defaultPartOfSpeech</code> parameter passed to the constructor. 104 * 105 * @param word the word to classify 106 * 107 * @return an implementation dependent part of speech for the word 108 */ 109 public String getPartOfSpeech(String word) { 110 String pos = (String) partOfSpeechMap.get(word); 111 if (pos == null) { 112 pos = defaultPartOfSpeech; 113 } 114 return pos; 115 } 116}