loadXML($metanord);
//validate the document
if (!$xmlv->schemaValidate($metnord_schema)) {
echo "error\t $entry \n";
libxml_display_errors();
} else {
echo "valid\t $entry \n";
//save the document (format with DOM)
$dom = new DOMDocument('1.0');
$dom->preserveWhiteSpace = false;
$dom->formatOutput = true;
$dom->loadXML($metanord);
$dom->save($to_dir . $entry);
}
}
}
closedir($handle);
} else {
print "Error loading $dir";
}
}
/**
 * Takes one base description (as produced from the Dublin Core record) and
 * returns the corresponding META-NORD record text.
 *
 * Keys read from $desc:
 *   - 'identifier', 'extent', 'type' ('corpus' or 'lex'), 'license'
 *   - 'title' and 'description', each with 'en' and 'sv' entries
 *   - 'handle' (optional); when empty a new 'hdl:10794/<uniqid>' value is
 *     generated, which differs on every call
 *   - 'relation' (optional) with optional lists 'downloads',
 *     'accessibleThroughInterface' and 'other'; every item has a 'url'
 *
 * Corpus sizes are taken from the global $korp_info when it holds numeric
 * 'Size' / 'Sentences' values for the upper-cased identifier.
 *
 * @param array $desc description array
 * @return string the generated record
 */
function dcToMetanord($desc) {
    // corpus statistics are provided by the surrounding script
    global $korp_info;

    $identifier = $desc['identifier'];
    $corpusKey = strtoupper($identifier);
    $licences = _metanordLicences($desc['license']);

    // Prefer the token count reported for the corpus over the declared extent.
    // isset() keeps this safe when the global or any nested key is missing.
    $tokens = $desc['extent'];
    if (isset($korp_info['corpora'][$corpusKey]['info']['Size'])
            && is_numeric($korp_info['corpora'][$corpusKey]['info']['Size'])) {
        $tokens = $korp_info['corpora'][$corpusKey]['info']['Size'];
    }

    // begin building the output
    $output = "\n"
            . htmlspecialchars($desc['title']['en']) . "\n"
            . htmlspecialchars($desc['title']['sv']) . "\n"
            . htmlspecialchars($desc['description']['en']) . "\n"
            . htmlspecialchars($desc['description']['sv']) . "\n"
            . $identifier . "\n"
            . $identifier . "\n";

    if (!empty($desc['handle'])) {
        $output .= $desc['handle'];
    } else {
        $output .= 'hdl:10794/' . uniqid("", true);
    }
    $output .= $identifier . "\n";
    $output .= 'available-restrictedUse';

    // One distribution entry per related URL. The relation key selects the
    // access-medium word written into the entry.
    $accessMedia = array(
        'downloads' => 'downloadable',
        'accessibleThroughInterface' => 'accessibleThroughInterface',
        'other' => 'other',
    );
    foreach ($accessMedia as $relationKey => $medium) {
        if (!isset($desc['relation'][$relationKey])) {
            continue;
        }
        foreach ($desc['relation'][$relationKey] as $item) {
            // URLs may contain '&' (query strings); escape them like the
            // titles/descriptions so they cannot break the generated XML.
            $output .= "\n"
                    . $licences . "\n"
                    . 'shareAlike' . "\n"
                    . $medium . "\n"
                    . htmlspecialchars($item['url']) . "\n";
        }
    }

    // fixed contact / metadata information; only the date varies
    $output .= "\n" . implode("\n", array(
        'Forsberg',
        'Markus',
        'sb-info@svenska.gu.se',
        'Språkbanken, University of Gothenburg',
        'Språkbanken',
        'sb-info@svenska.gu.se',
        'Forsberg',
        'Markus',
        'sb-info@svenska.gu.se',
        'Språkbanken',
        'sb-info@svenska.gu.se',
        date("Y-m-d"),
        'Olsson',
        'Olof',
        'sb-info@svenska.gu.se',
        'Språkbanken',
        'sb-info@svenska.gu.se',
        'true',
        'formal',
        'automatic',
        'full',
    )) . "\n";

    // Documentation (manuals)
    $output .= "\n"
            . 'http://spraakbanken.gu.se/eng/resource/' . $identifier . "\n"
            . 'manual' . "\n"
            . 'online' . "\n";
    $output .= "\n";

    if ($desc['type'] === 'corpus') {
        $output .= "\n" . implode("\n", array(
            'corpus',
            'text',
            'monolingual',
            'swe',
            'Swedish',
            'Latn',
            'other',
            $tokens,
            'tokens',
        )) . "\n";
        $output .= "\n" . $tokens . "\n" . 'tokens' . "\n";
        // the sentence count is only written when a numeric value is known
        if (isset($korp_info['corpora'][$corpusKey]['info']['Sentences'])
                && is_numeric($korp_info['corpora'][$corpusKey]['info']['Sentences'])) {
            $output .= "\n"
                    . $korp_info['corpora'][$corpusKey]['info']['Sentences'] . "\n"
                    . 'sentences' . "\n";
        }
        $output .= "\n";
    } else if ($desc['type'] === 'lex') {
        $extent = $desc['extent'];
        $output .= "\n" . implode("\n", array(
            'lexicalConceptualResource',
            'computationalLexicon',
            'morphology',
            'LMF',
            'text',
            'monolingual',
            'swe',
            'Swedish',
            'writtenLanguage',
            $extent,
            'entries',
            'other',
            $extent,
            'entries',
            'general',
            $extent,
            'entries',
            'other',
            $extent,
            'entries',
            'other',
            $extent,
            'entries',
        )) . "\n";
    }

    $output .= "\n";
    return $output;
}

/**
 * Maps the free-text licence field to the licence text used in the record.
 *
 * A value mentioning "sb-info@svenska.gu.se" or "Egen:" becomes
 * 'CC_BY-SA_3.0'. A comma-separated value yields one newline-terminated
 * line per entry, with GFDL, "CC-BY-SA 3.0" and "LGPL 3.0" mapped to
 * 'GFDL', 'CC_BY-SA_3.0' and 'LGPL'. Any other single value is returned
 * unchanged.
 *
 * @param string $license raw licence text
 * @return string licence text for the record
 */
function _metanordLicences($license) {
    if (stripos($license, "sb-info@svenska.gu.se") !== false || stripos($license, "Egen:") !== false) {
        $license = 'CC_BY-SA_3.0';
    }
    if (strpos($license, ',') === false) {
        return $license;
    }

    $licences = '';
    foreach (explode(',', $license) as $part) {
        // entries after a comma usually start with a space, so compare the
        // trimmed text
        $name = trim($part);
        if (stripos($name, 'GFDL') !== false) {
            $licences .= 'GFDL' . "\n";
        } else if ($name === "CC-BY-SA 3.0") {
            $licences .= 'CC_BY-SA_3.0' . "\n";
        } else if (stripos($name, 'LGPL 3.0') !== false) {
            $licences .= 'LGPL' . "\n";
        } else {
            $licences .= $name . "\n";
        }
    }
    return $licences;
}
/**
 * Extracts the base description from a Dublin Core record.
 *
 * The result holds the string values 'identifier', 'extent', 'type' and
 * 'license', plus 'title' and 'description' arrays keyed by 'sv' and 'en'.
 * An element that is missing from the record yields an empty string.
 *
 * Every non-empty dc:relation is stored as array('url' => ..., 'label' => ...)
 * (label = the xsi "format" attribute) in one of these lists:
 *   - $result['relation']['downloads']
 *   - $result['relation']['accessibleThroughInterface']
 *   - $result['relation']['resourceDocumentationInfo']['manual']
 *   - $result['relation']['other']
 * The 'relation' key is absent when the record has no usable relation.
 *
 * @param SimpleXMLElement $xml xml for the dublin core
 * @return array the extracted elements
 */
function _getBaseDescriptionArray($xml) {
    $tmp = array();
    $tmp['identifier'] = _firstXpathString($xml, '//dc:identifier');
    $tmp['extent'] = _firstXpathString($xml, '//dcterms:extent');
    $tmp['type'] = _firstXpathString($xml, '//dc:type');
    $tmp['license'] = _firstXpathString($xml, '//dc:license');
    $tmp['title']['sv'] = _firstXpathString($xml, '//dc:title[@xml:lang = "sv"]');
    $tmp['title']['en'] = _firstXpathString($xml, '//dc:title[@xml:lang = "en"]');
    $tmp['description']['sv'] = _firstXpathString($xml, '//dc:description[@xml:lang = "sv"]');
    $tmp['description']['en'] = _firstXpathString($xml, '//dc:description[@xml:lang = "en"]');

    // format labels (compared lower-cased) that mark a downloadable file
    $downloadFormats = array('xml', 'lmf', 'html', 'bz2', 'txt', 'xls');
    // URL fragments of search/browse interfaces
    $interfaceUrls = array(
        'korp/#corpus=',
        'glossa/html',
        'FTS/search.phtml',
        'http://www.medicinskkorpus.se/login.phtml',
        'http://www.dramawebben.se/',
        'http://sv.wikipedia.org/',
        'karp',
    );
    // URL fragments that are filed as documentation (manual)
    $manualUrls = array(
        'korp/ws',
        'simple_parole_index.html',
        'http://spraakbanken.gu.se/strindberg/',
    );

    $relations = $xml->xpath('//dc:relation');
    if (!is_array($relations)) {
        // xpath() reports failure with a non-array value
        $relations = array();
    }
    foreach ($relations as $rel) {
        $url = (string) $rel[0];
        $xsi = $rel->attributes("xsi", true);
        $format = (string) $xsi['format'];
        $resource = array(
            'url' => trim($url),
            'label' => $format,
        );
        if (empty($resource['url'])) {
            continue;
        }

        // first matching category wins, in this order
        if (_containsAny(strtolower($format), $downloadFormats)
                || strpos($url, 'pub/reskit/parole.zip') !== false) {
            $tmp['relation']['downloads'][] = $resource;
        } else if (_containsAny($url, $interfaceUrls)) {
            $tmp['relation']['accessibleThroughInterface'][] = $resource;
        } else if (_containsAny($url, $manualUrls)) {
            $tmp['relation']['resourceDocumentationInfo']['manual'][] = $resource;
        } else {
            $tmp['relation']['other'][] = $resource;
        }
    }
    return $tmp;
}

/**
 * Returns the text of the first node matched by an XPath query, or an empty
 * string when nothing matches (or the query fails).
 *
 * @param SimpleXMLElement $xml node to query
 * @param string $query XPath expression
 * @return string
 */
function _firstXpathString($xml, $query) {
    $nodes = $xml->xpath($query);
    if (!is_array($nodes) || !isset($nodes[0])) {
        return '';
    }
    return (string) $nodes[0];
}

/**
 * Tells whether $haystack contains at least one of the given substrings.
 *
 * @param string $haystack text to search
 * @param array $needles non-empty substrings to look for
 * @return bool
 */
function _containsAny($haystack, $needles) {
    foreach ($needles as $needle) {
        if (strpos($haystack, $needle) !== false) {
            return true;
        }
    }
    return false;
}
/**
 * Gets the identifiers from a folder containing existing metanord-records.
 *
 * Each regular file in the folder is parsed as XML and its
 * IdentificationInfo/resourceShortName (lower-cased) becomes the key of the
 * result. Files that cannot be parsed, or that have no short name, are
 * skipped.
 *
 * @param string $path path to the folder
 * @return array map of short name => array('name', 'pid', 'handle'), where
 *               'handle' is 'hdl:10794/' followed by the pid; empty when the
 *               folder cannot be opened
 */
function getLegacyIDs($path) {
    $result = array();
    $handle = opendir($path);
    if ($handle === false) {
        return $result;
    }

    while (false !== ($entry = readdir($handle))) {
        if ($entry == "." || $entry == ".." || $entry == ".svn") {
            continue;
        }
        $file = $path . '/' . $entry;
        if (!is_file($file)) {
            continue;
        }
        $xml = simplexml_load_file($file);
        if ($xml === false) {
            // not well-formed XML; nothing to extract
            continue;
        }

        $pid = (string) $xml->IdentificationInfo->pid;
        $resourceShortName = strtolower((string) $xml->IdentificationInfo->resourceShortName);

        // Matches on "derwall" only, presumably to avoid depending on how the
        // leading "sö" is encoded. strpos() returns 0 for a match at the very
        // start, so the result must be compared against false explicitly.
        if (strpos($resourceShortName, 'derwall supplement') !== false) {
            $resourceShortName = "soederwall_supp";
        } else if (strpos($resourceShortName, 'derwall') !== false) {
            $resourceShortName = "soederwall";
        }

        if (!empty($resourceShortName)) {
            $result[$resourceShortName] = array(
                'name' => $resourceShortName,
                'pid' => $pid,
                'handle' => 'hdl:10794/' . $pid,
            );
        }
    }
    closedir($handle);

    return $result;
}
////////////////////////////////////////////////////////////////////////////////
/**
 * Formats a single libxml error as one tab-indented line of text.
 *
 * The line consists of a severity label with the error code (omitted for
 * unknown levels), the trimmed message, the file name when one is set, and
 * the line number.
 *
 * @param object $error error object with level, code, message, file and line
 * @return string the formatted line, terminated by a newline
 */
function libxml_display_error($error) {
    $level = $error->level;

    if ($level == LIBXML_ERR_WARNING) {
        $label = 'Warning ' . $error->code . ' : ';
    } elseif ($level == LIBXML_ERR_ERROR) {
        $label = 'Error ' . $error->code . ' : ';
    } elseif ($level == LIBXML_ERR_FATAL) {
        $label = 'Fatal Error ' . $error->code . ' : ';
    } else {
        $label = '';
    }

    $text = "\t" . $label . trim($error->message);
    if ($error->file) {
        $text .= ' in ' . $error->file;
    }

    return $text . ' on line ' . $error->line . "\t" . "\n";
}
/**
 * Prints every error currently held in the libxml error buffer, one
 * formatted line per error, and then empties the buffer.
 */
function libxml_display_errors() {
    $lines = array_map('libxml_display_error', libxml_get_errors());
    print implode('', $lines);
    libxml_clear_errors();
}
?>