Source code

001/**
002 * Portions Copyright 2001 Sun Microsystems, Inc.
003 * Portions Copyright 1999-2001 Language Technologies Institute, 
004 * Carnegie Mellon University.
005 * All Rights Reserved.  Use is subject to license terms.
006 * 
007 * See the file "license.terms" for information on usage and
008 * redistribution of this file, and for a DISCLAIMER OF ALL 
009 * WARRANTIES.
010 */
011package com.sun.speech.freetts;
012
013import java.io.BufferedReader;
014import java.io.IOException;
015import java.io.InputStreamReader;
016import java.net.URL;
017import java.util.HashMap;
018import java.util.Map;
019import java.util.NoSuchElementException;
020import java.util.StringTokenizer;
021
022/**
023 * Implementation of a <code>PartOfSpeech</code> that reads the info
024 * from a file.  The format of the file is as follows:
025 *
026 * <pre>
027 * word pos
028 * word pos
029 * word pos
030 * ...
031 * </pre>
032 *
033 * Where <code>word</code> is the word and <code>pos</code> is the
034 * part of speech for the word.  The part of speech is implementation
035 * dependent.
036 */
037public class PartOfSpeechImpl implements PartOfSpeech {
038    /**
039     * Used for informational purposes if there's a bad line in the
040     * file.
041     */ 
042    private int lineCount = 0;
043
044    /**
045     * A map from words to their part of speech.
046     */
047    private Map partOfSpeechMap;
048
049    /**
050     * Default part of speech.
051     */
052    private String defaultPartOfSpeech;
053
054    /**
055     * Creates a new PartOfSpeechImpl by reading from the given URL.
056     *
057     * @param url the input source
058     * @param defaultPartOfSpeech the default part of speech
059     *
060     * @throws IOException if an error occurs
061     */ 
062    public PartOfSpeechImpl(URL url, String defaultPartOfSpeech) 
063        throws IOException {
064        
065        BufferedReader reader;
066        String line;
067
068        partOfSpeechMap = new HashMap();
069        this.defaultPartOfSpeech = defaultPartOfSpeech;
070        reader = new BufferedReader(new
071                InputStreamReader(url.openStream()));
072        line = reader.readLine();
073        lineCount++;
074        while (line != null) {
075            if (!line.startsWith("***")) {
076                parseAndAdd(line);
077            }
078            line = reader.readLine();
079        }
080        reader.close();
081    }
082    
083    /**
084     * Creates a word from the given input line and adds it to the map.
085     *
086     * @param line the input line
087     */
088    private void parseAndAdd(String line) {
089        StringTokenizer tokenizer = new StringTokenizer(line," ");
090        try {
091            String word = tokenizer.nextToken();
092            String pos = tokenizer.nextToken();        
093            partOfSpeechMap.put(word, pos);
094        } catch (NoSuchElementException nse) {
095            System.err.println("part of speech data in bad format at line " 
096            + lineCount);
097        }
098    }
099
100    /**
101     * Returns a description of the part of speech given a word.
102     * If the given word cannot be found, the part of speech will be the
103     * <code>defaultPartOfSpeech</code> parameter passed to the constructor.
104     *
105     * @param word the word to classify
106     *
107     * @return an implementation dependent part of speech for the word
108     */
109    public String getPartOfSpeech(String word) {
110        String pos = (String) partOfSpeechMap.get(word);
111        if (pos == null) {
112            pos = defaultPartOfSpeech;
113        }
114        return pos;
115    }
116}