aboutsummaryrefslogtreecommitdiff
path: root/src/com/benlinskey/grdbc/LexiconParser.java
blob: 46dcad548b60768ce7938d6376da62272968b5cf (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
/* Copyright 2013 Benjamin Linskey
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.benlinskey.grdbc;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import edu.unc.epidoc.transcoder.TransCoder;

/**
 * This class provides methods to parse a chunk of XML containing a lexicon
 * entry, modify the data contained therein, and return data to be inserted
 * into the database.
 * @author Ben Linskey
 */
public class LexiconParser {
	/** DOM tree of the lexicon entry; modified in place by {@link #getEntry()}. */
	private final Document doc;

	/** Converts Beta Code ("BetaCode") to Greek characters ("UnicodeC"). */
	private TransCoder transcoder;
	
	/**
	 * Class constructor. Parses the given XML into a DOM document and creates
	 * the transcoder used to convert Beta Code to Greek characters. If the
	 * transcoder cannot be created, the stack trace is printed and the JVM
	 * exits with status 1.
	 * @param xml	the XML to parse
	 * @throws ParserConfigurationException	if a DocumentBuilder cannot be 
	 * 			created
	 * @throws IOException	if an I/O error occurs while reading the XML
	 * @throws SAXException	if the XML cannot be parsed
	 */
	public LexiconParser(String xml) throws ParserConfigurationException, SAXException, IOException {
		// Parse the XML and create a Document.
		DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
		DocumentBuilder db = dbf.newDocumentBuilder();
		InputSource is = new InputSource(new StringReader(xml));
		doc = db.parse(is);
		
		// Create a TransCoder for converting Beta Code to Greek characters.
		try {
			transcoder = new TransCoder("BetaCode", "UnicodeC");
		} catch (Exception e) {
			e.printStackTrace();
			System.exit(1);
		}
	}
	
	/**
	 * Returns a Beta Code representation of this entry's word, stripped of
	 * everything except the ASCII letters a-z and A-Z.
	 * @return	this entry's word in Beta Code without diacritics
	 */
	public String getBetaNoSymbols() {
		// Get the word and replace all symbols with an empty string.
		return getBetaSymbols().replaceAll("[^a-zA-Z]", "");
	}
	
	/**
	 * Returns a Beta Code representation of this entry's word, read from the
	 * "key" attribute of the first "entry" element in the document.
	 * @return	this entry's word in Beta Code
	 */
	public String getBetaSymbols() {
		// We just need the "key" attribute from the "entry" element.
		Node entry = doc.getElementsByTagName("entry").item(0);
		return entry.getAttributes().getNamedItem("key").getTextContent();
	}
	
	/**
	 * Returns this entry's word in Greek characters.
	 * @return	this entry's word in Greek characters
	 */
	public String getGreekFullWord() {
		// Use the transcoder to convert the beta code to Greek.
		return betaToGreek(getBetaSymbols());
	}
	
	/**
	 * Returns this entry's word in Greek characters, stripped of all 
	 * diacritics.
	 * @return	this entry's word in Greek characters without diacritics
	 */
	public String getGreekNoSymbols() {
		// Get beta code with no symbols other than the capital letter marker.
		String beta = getBetaSymbols().replaceAll("[^a-zA-Z\\*]", "");
		
		// Use the transcoder to convert the beta code to Greek.
		return betaToGreek(beta);
	}
	
	/**
	 * Returns this entry's word in all lowercase Greek characters, stripped
	 * of all diacritics.
	 * @return	this entry's word in lowercase Greek characters without 
	 * 			diacritics
	 */
	public String getGreekLowercase() {
		return getGreekNoSymbols().toLowerCase();
	}
	
	/**
	 * Returns the XML for this entry, with the Beta Code in "orth", "ref" and
	 * "foreign" elements marked lang="greek" converted to Greek characters.
	 * The underlying document is modified in place.
	 * @return	the XML for this entry with all Beta Code converted to Greek
	 * 			characters
	 */
	public String getEntry() {
		transcodeInElements("orth");
		transcodeInElements("ref");
		transcodeInElements("foreign");
		return getUpdatedXML();
	}
	
	/**
	 * Transcodes beta code to Greek in elements with the given name. Only
	 * elements whose "lang" attribute equals "greek" are changed; their text
	 * content is replaced with its Greek equivalent and the attribute itself
	 * is left untouched.
	 * @param element	the name of the element to search for
	 */
	private void transcodeInElements(String element) {
		NodeList nodeList = doc.getElementsByTagName(element);
		for (int i = 0; i < nodeList.getLength(); i++) {
			Node elementNode = nodeList.item(i);
			Node langAttr = elementNode.getAttributes().getNamedItem("lang");
			if (langAttr != null) {
				String lang = langAttr.getTextContent();
				if (lang.equals("greek")) {
					String greek = betaToGreek(elementNode.getTextContent());
					// Write the Greek text into the element itself, not into
					// the "lang" attribute. Note that setTextContent replaces
					// any child nodes with a single text node.
					elementNode.setTextContent(greek);
				}
			}
		}
	}
	
	/**
	 * Returns a string containing an XML representation of the document in its 
	 * current state. If the document cannot be serialized, the stack trace is
	 * printed and the JVM exits with status 1.
	 * @return	a string containing an XML representation of the document in its 
	 * 			current state
	 */
	private String getUpdatedXML() {
		StringWriter writer = new StringWriter();
		try {
			TransformerFactory tf = TransformerFactory.newInstance();
			Transformer transformer = tf.newTransformer();
			DOMSource source = new DOMSource(doc);
			StreamResult result = new StreamResult(writer);
			transformer.transform(source, result);
		} catch (TransformerException e) {
			// Also covers TransformerConfigurationException, which is a 
			// subclass of TransformerException.
			e.printStackTrace();
			System.exit(1);
		}
		return writer.toString();
	}
	
	/**
	 * Converts Beta Code to Greek characters. If the transcoder reports an
	 * unsupported encoding, the stack trace is printed and the JVM exits with
	 * status 1.
	 * @param beta	the Beta Code to transcode
	 * @return	the Greek equivalent of the specified Beta Code
	 */
	private String betaToGreek(String beta) {
		String greek = null;
		try {
			greek = transcoder.getString(beta); 
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
			System.exit(1);
		}
		return greek;
	}
}