summaryrefslogtreecommitdiff
path: root/Nagoya University OPAC.js
blob: 35903b28ad194558cd19ca59e5a662f02369208f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
{
	"translatorID": "b56d756e-814e-4b46-bc58-d61dccc9f32f",
	"label": "Nagoya University OPAC",
	"creator": "Frank Bennett",
	"target": "^https?://opac\\.nul\\.nagoya-u\\.ac\\.jp/webopac/(catdbl.do|ctlsrh\\.do)",
	"minVersion": "2.0b7",
	"maxVersion": "",
	"priority": 100,
	"inRepository": true,
	"translatorType": 4,
	"browserSupport": "gcsibv",
	"lastUpdated": "2012-07-13 07:33:49"
}

// #######################
// ##### Sample URLs #####
// #######################

/*
 * The site is session-based, with page content negotiated
 * in POST calls.  The starting point for an OPAC search is
 * the URL below.  In testing, I tried the following:
 *
 *   - A search listing of books
 *   - A search listing of journals (no icon)
 *   - A mixed search listing of books and journals
 *   - A journal page (no icon)
 *   - A book page
 */
// http://opac.nul.nagoya-u.ac.jp/webopac/catsrk.do



// #####################
// ##### Constants #####
// #####################

/*
 * Base URL of a single-record page.  doWeb appends the record key
 * and the initFlg parameter to it when following search hits.
 */
var itemUrlBase = "http://opac.nul.nagoya-u.ac.jp/webopac/catdbl.do";

/*
 * Row labels used to locate each field on a record page.
 * Each entry is a pair: [Japanese label, English label].
 * scrapePage looks for a table header containing either one.
 */
var pageStrings = {
	"title": ["タイトル / 著者", "Title / Author"],
	"year": ["出版・頒布", "Publication"],
	"isbn": ["ISBN", "ISBN"],
	"authors": ["著者名リンク", "Author link"],
	"series": ["シリーズ情報", "Series information"]
};

// ############################
// ##### String functions #####
// ############################

/*
 * Parse the byline that follows the last "/" of the first title string.
 *
 * Every raw byline fragment is appended to item.authorstrings so that
 * parseJapaneseAuthors can later look for role hints in it.  When the
 * byline contains nothing but the roman letters, digits and punctuation
 * accepted by the check below, each fragment is additionally cleaned and
 * pushed onto item.creators.
 *
 * @param item  item being built; must carry the arrays item.authorstrings
 *              and item.creators
 * @param data  scraped field map; data['title'][0] is the raw title string
 * @return true when the title has no "/" (no byline at all); otherwise the
 *         result of the non-roman character check (a match array when
 *         such characters were found, null when not).  The caller treats
 *         a truthy value as "creators still have to be taken from the
 *         author links".
 */
var parseRomanAuthors = function (item, data) {
	var datastring = data['title'][0];
	// don't bother if there is no author info
	if (datastring.indexOf("/") === -1) {
		return true;
	}
	// cut off the title
	datastring = datastring.replace(/.*\//, "");
	// raise flag if there are characters outside the roman set (e.g. Japanese)
	var japanese_check = datastring.match(/.*[^- &0-9()\[\];:,.a-zA-Z].*/);
	// Turn name separators into semicolons to prepare for the split.  The
	// replacements are global so that a byline with three or more names is
	// split at every separator, not only at the first one.
	datastring = datastring.replace(/,(\s+[a-zA-Z]{3,})/g, ";$1");
	datastring = datastring.replace(/,(\s+[a-zA-Z]{1}[^a-zA-Z])/g, ";$1");
	datastring = datastring.replace(/(\s+and\s+)/g, "; ");
	datastring = datastring.replace(/(\s+&\s+)/g, "; ");
	// split the authors
	var authors = datastring.replace(/\|.*/, "").split(";");
	// this is parsing the authors for a single work.  if there is a special byline, we
	// assume that it applies to all subsequent entries until overridden.
	var authortype = 'author';
	for (var idx = 0; idx < authors.length; idx++) {
		var fragment = authors[idx];
		item.authorstrings.push(fragment);
		// the lower-case lead-in ("edited by", "translated by", ...) names the role
		var authortypehint = fragment.replace(/^([ ,.:a-z]*).*/, "$1");
		if (authortypehint.match(/.*(edit|organiz).*/)) {
			authortype = "editor";
		} else if (authortypehint.match(/.*trans.*/)) {
			authortype = "translator";
		}
		// drop the lead-in and anything after an ellipsis
		var author = fragment.replace(/^[ a-z]*/, "").replace(/\.\.\..*/, "");
		// need to test for length because the replacement of separators with semicolons
		// can leave an empty fragment (e.g. a byline that originally ended in a comma)
		if (!japanese_check && author.length) {
			item.creators.push(Zotero.Utilities.cleanAuthor(author, authortype));
		}
	}
	return japanese_check;
};

/*
 * Build creators from the author-link strings in data['authors'].
 *
 * For each link the role is guessed first from the link text itself
 * (編 -> editor, 訳 -> translator, otherwise author) and then
 * double-checked against the raw byline fragments that
 * parseRomanAuthors stored in item.authorstrings: a fragment that
 * contains the last space-separated token of the name and ends in a
 * translator or editor marker overrides the first guess.
 * Cruft is stripped from the link text, "A, B" is reordered to "B A",
 * and the result is pushed onto item.creators.
 *
 * item.authorstrings is removed once ALL links have been processed, so
 * the byline hints stay available for every author, not only the first.
 *
 * @param item  item being built; item.creators must be an array
 * @param data  scraped field map; data['authors'] is the list of
 *              author-link strings (may be absent)
 */
var parseJapaneseAuthors = function (item, data) {
	var authors = data['authors'] || [];
	var hints = item.authorstrings || [];
	for (var a = 0; a < authors.length; a++) {
		var raw = authors[a];
		var authortype = 'author';
		if (raw.match(/.*編.*/)) {
			authortype = 'editor';
		} else if (raw.match(/.*訳.*/)) {
			authortype = 'translator';
		}
		// strip asterisks, cut at the first digit/bracket/bar, then swap "A, B" -> "B A"
		var author = raw.replace(/[*]/g, "").replace(/[0-9<()|].*/, "").replace(/(.*?),(.*)/, "$2 $1");
		// After the swap the last token is the part that preceded the comma
		// (presumably the family name); it is the search key for the hints.
		var name = author.split(" ");
		name.reverse();
		var searchKey = name[0];
		// Double-check in the raw byline entries for a translator or editor hint.
		// This is an enormous pain, but the original records are a mess, with different
		// conventions for Japanese and foreign records, sometimes mixed up in the same entry.
		for (var h = 0; h < hints.length; h++) {
			// Fragments come from a split on ";" and may keep trailing blanks,
			// which would defeat the end-anchored checks below.
			var authorstring = hints[h].replace(/\s+$/, "");
			if (authorstring.indexOf(searchKey) === -1) {
				continue;
			}
			if (authorstring.match(/.*(訳|譯|譯註)$/)) {
				authortype = 'translator';
				break;
			}
			if (authorstring.match(/.*(編|編著)$/)) {
				authortype = 'editor';
				break;
			}
		}
		item.creators.push(Zotero.Utilities.cleanAuthor(author, authortype));
	}
	// the raw bylines were only scaffolding; do not leave them on the saved item
	delete item.authorstrings;
};

/*
 * Split the extracted title field in place.  The field starts as a
 * one-element list, but that element can hold entries for several works,
 * as in an omnibus volume of translated works, for example.  Everything
 * from the first "|" on is discarded (in Japanese records this is the
 * appended phonetic information), and the remainder is split on " . "
 * (space, period, space), the delimiter this code uses between works.
 *
 * Does nothing when no title was scraped, so callers may invoke it
 * before checking for a title.
 *
 * @param data  scraped field map; data['title'] is replaced by the list
 *              of per-work strings
 */
function splitTitle(data) {
	var rawTitles = data['title'];
	if (!rawTitles || !rawTitles.length) {
		return;
	}
	var titlestring = rawTitles[0].replace(/\|.*/, "");
	data['title'] = titlestring.split(" . ");
}

// ##########################
// ##### Page functions #####
// ##########################

/*
 * Inspect a search-result page for book rows.  A book row is a table
 * cell that follows a cell whose 'lst_value' div contains "Books".
 *
 * Without getlist (or with a falsy value): return the first such cell,
 * or whatever the XPath iterator yields when there is none, so the
 * result can be used as a truth value.
 *
 * With a truthy getlist: return one object merging, over all book
 * cells, the entries Zotero.Utilities.getItemArray finds for links
 * matching the pkey-assignment pattern.
 */
var sniffIndexPage = function (doc, getlist) {
	var bookCellPath = "//td[div[@class='lst_value' and contains(text(),'Books')]]/following-sibling::td";
	var cells = doc.evaluate(bookCellPath, doc, null, XPathResult.ANY_TYPE, null);
	var cell = cells.iterateNext();

	// detection mode: the first hit (if any) is all the caller needs
	if (!getlist) {
		return cell;
	}

	// listing mode: walk every book cell and pool its record links
	var collected = {};
	for (; cell; cell = cells.iterateNext()) {
		var links = Zotero.Utilities.getItemArray(doc, cell, "document\\.catsrhform\\.pkey.value=");
		for (var link in links) {
			collected[link] = links[link];
		}
	}
	return collected;
};

/*
 * Convenience wrapper: run sniffIndexPage in listing mode and hand
 * back its collection of book entries for the given document.
 */
var getBookItems = function (doc) {
	var wantList = true;
	return sniffIndexPage(doc, wantList);
};

/*
 * Extract field values from a record page.
 *
 * For every key in spec, find the table header whose div contains
 * either of the two label strings and collect the trimmed text of the
 * div elements in the cells that follow it.
 *
 * @param doc   the record page DOM
 * @param spec  map of field name -> [label, alternative label]
 *              (see pageStrings)
 * @return map of field name -> array of strings; a field that was not
 *         found on the page has no entry at all
 */
var scrapePage = function (doc, spec) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function (prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;
	var data = {};
	// "key" is declared locally; an undeclared loop variable would be an
	// implicit global shared with every other loop in the translator
	for (var key in spec) {
		var labels = spec[key];
		var xpath = "//th[div[contains(text(),'" + labels[0] + "') or contains(text(),'" + labels[1] + "')]]/following-sibling::td/div";
		var check = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
		var c = check.iterateNext();
		while (c) {
			// create the list lazily so that absent fields stay undefined
			if (!data[key]) {
				data[key] = [];
			}
			data[key].push(Zotero.Utilities.trimInternal(c.textContent));
			c = check.iterateNext();
		}
	}
	return data;
};

/*
 * Bring it all together: scrape one record page and save it as a book.
 *
 * Steps: bail out unless detectWeb recognises the page; scrape the
 * labelled fields; derive title and creators; pick date, place and
 * publisher from the first publication entry that contains a run of
 * digits; take series and ISBN from their first entries; finally call
 * item.complete().
 *
 * @param doc  the record page DOM
 * @param url  URL of that page (only used for detection)
 * @return false when the page is not recognised, otherwise nothing
 */
function scrapeAndParse(doc, url) {
	if (!detectWeb(doc, url)) {
		return false;
	}
	var item = new Zotero.Item("book");
	// scratch list of raw bylines, filled by parseRomanAuthors
	item.authorstrings = [];
	var data = scrapePage(doc, pageStrings);

	if (data['title']) {
		// split only once we know a title exists; a record without one
		// must not abort the whole scrape
		splitTitle(data);
		var titles = [];
		for (var t = 0; t < data['title'].length; t++) {
			// keep only the part before " /", i.e. drop the byline
			titles.push(data['title'][t].replace(/\s+\/.*/, ""));
		}
		item.title = titles.join(", ");
		var jse_authors = parseRomanAuthors(item, data);
		if (jse_authors) {
			parseJapaneseAuthors(item, data);
		}
	}

	if (data['year']) {
		// sometimes there are multiple "date" fields, some of which are filled
		// with other random information
		for (var y = 0; y < data['year'].length; y++) {
			var year = data['year'][y];
			if (year.match(/.*[0-9]{3}.*/)) {
				item.date = year.replace(/.*?([0-9][.0-9][0-9]+).*/, "$1");
				// place: everything before the first colon, square brackets removed
				item.place = year.replace(/:.*/, "").replace(/[\[\]]/g, "");
				// publisher: what sits between the last colon and the last comma
				item.publisher = year.replace(/.*:(.*),.*/, "$1");
				break;
			}
		}
	}

	if (data['series']) {
		item.series = data['series'][0].replace(/[/|<].*/, "");
	}

	if (data['isbn']) {
		// first run of digits, plus the "X" check digit an ISBN-10 may end in
		item.ISBN = data['isbn'][0].replace(/[^0-9]*([0-9]+[Xx]?).*/, "$1");
	}
	item.complete();
}

// #########################
// ##### API functions #####
// #########################

/*
 * Classify a page of the OPAC.
 *
 * Record pages (catdbl.do) are reported as "book" unless a table header
 * mentioning publication frequency or volumes marks them as a journal.
 * Search-result pages (ctlsrh.do) are reported as "multiple" when
 * sniffIndexPage finds at least one book row.  Anything else yields false.
 */
function detectWeb(doc, url) {
	var isRecordPage = url.match(/.*\/webopac\/catdbl.do/);
	if (isRecordPage) {
		var journalHeaderPath = '//th[div[contains(text(),"Frequency of publication") or contains(text(),"刊行頻度") or contains(text(),"巻号") or contains(text(),"Volumes")]]';
		var journalHeader = doc.evaluate(journalHeaderPath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
		return journalHeader ? false : "book";
	}

	var isResultList = url.match(/.*\/webopac\/ctlsrh.do/);
	if (isResultList && sniffIndexPage(doc)) {
		return "multiple";
	}
	return false;
}

/*
 * Translator entry point.
 *
 * On a search-result page, let the user pick entries, turn each picked
 * entry into the URL of its record page and scrape ALL of those pages.
 * On a single book page, scrape the page directly.
 *
 * @param doc  DOM of the current page
 * @param url  URL of the current page
 */
function doWeb(doc, url) {
	var format = detectWeb(doc, url);
	if (format == "multiple") {
		var selected = Zotero.selectItems(getBookItems(doc));
		// nothing to do when the dialog yields nothing (e.g. it was dismissed)
		if (!selected) {
			return;
		}
		// collect into an object first so that each record URL appears only once
		var items = {};
		for (var link in selected) {
			// the record key is the quoted value assigned to catsrhform.pkey
			var m = link.match(/.*document\.catsrhform\.pkey\.value=\'([^\']+)\'.*/);
			if (!m) {
				continue;
			}
			items[itemUrlBase + "?pkey=" + m[1] + "&initFlg=_RESULT_SET_NOTBIB"] = true;
		}
		var urls = [];
		for (var recordUrl in items) {
			urls.push(recordUrl);
		}
		// hand over the whole list, not just the last URL seen by the loop
		if (urls.length) {
			ZU.processDocuments(urls, scrapeAndParse);
		}
	} else if (format == "book") {
		scrapeAndParse(doc, url);
	}
}
/** BEGIN TEST CASES **/
var testCases = [
	{
		"type": "web",
		"url": "http://opac.nul.nagoya-u.ac.jp/webopac/catdbl.do?pkey=TY50091937&initFlg=_RESULT_SET_NOTBIB",
		"items": [
			{
				"itemType": "book",
				"creators": [
					{
						"firstName": "Jeremy",
						"lastName": "Adelman",
						"creatorType": "author"
					}
				],
				"notes": [],
				"tags": [],
				"seeAlso": [],
				"attachments": [],
				"authorstrings": " Jeremy Adelman",
				"title": "Frontier development : land, labour, and capital on the wheatlands of Argentina and Canada, 1890-1914",
				"date": "1994",
				"place": "Oxford",
				"publisher": "Clarendon Press",
				"series": "Oxford historical monographs",
				"ISBN": "0198204418",
				"libraryCatalog": "Nagoya University OPAC",
				"shortTitle": "Frontier development"
			}
		]
	}
]
/** END TEST CASES **/