loadXML($metanord);
                // NOTE(review): everything down to the first closing brace at
                // column 0 is the tail of a definition that starts outside the
                // visible source. It is reproduced unchanged, only re-indented.
                //validate the document
                if (!$xmlv->schemaValidate($metnord_schema)) {
                    echo "error\t $entry \n";
                    libxml_display_errors();
                } else {
                    echo "valid\t $entry \n";
                    //save the document (format with DOM)
                    $dom = new DOMDocument('1.0');
                    $dom->preserveWhiteSpace = false;
                    $dom->formatOutput = true;
                    $dom->loadXML($metanord);
                    $dom->save($to_dir . $entry);
                }
            }
        }
        closedir($handle);
    } else {
        print "Error loading $dir";
    }
}

/*******
 * takes a set of base descriptions and outputs metanord xml
 *
 * Reads the global $korp_info: when it holds a numeric 'Size' (and
 * 'Sentences') for the upper-cased identifier, those figures are used
 * instead of / in addition to the extent given in the description.
 *
 * Every value taken from $desc is passed through htmlspecialchars() before
 * it is appended, the same way titles and descriptions always were, so that
 * characters such as '&' in a URL cannot break the generated document.
 *
 * NOTE(review): as seen here the literal fragments carry no XML markup even
 * though the function is documented as producing XML. They are reproduced
 * exactly as found - verify them against the canonical file.
 *
 * @desc array of descriptions; keys read: 'license', 'extent', 'identifier',
 *       'type', 'title' (en/sv), 'description' (en/sv), and optionally
 *       'handle' and 'relation' (downloads / accessibleThroughInterface /
 *       other, each a list of arrays with a 'url' key)
 * returns the record as a string
 */
function dcToMetanord($desc) {
    //get the corpora info
    global $korp_info;

    //fix the licence-element
    if (stristr($desc['license'], "sb-info@svenska.gu.se") || stristr($desc['license'], "Egen:")) {
        $desc['license'] = 'CC_BY-SA_3.0';
    }

    $licences = '';
    if (stristr($desc['license'], ',')) {
        foreach (explode(',', $desc['license']) as $l) {
            // Trim first: the parts of "A, B" keep the blank after the comma,
            // which used to make the exact comparison below fail.
            $l = trim($l);
            if (stristr($l, 'GFDL')) {
                $licences .= 'GFDL' . "\n";
            } else if ($l === "CC-BY-SA 3.0") {
                $licences .= 'CC_BY-SA_3.0' . "\n";
            } else if (stristr($l, 'LGPL 3.0')) {
                $licences .= 'LGPL' . "\n";
            } else {
                $licences .= '' . htmlspecialchars($l) . '' . "\n";
            }
        }
    } else {
        $licences = '' . htmlspecialchars($desc['license']) . '';
    }

    // Statistics for this corpus, if $korp_info has any. isset() keeps the
    // lookup safe when the global, the corpus or the 'info' entry is missing.
    $corpusKey = strtoupper($desc['identifier']);
    $corpusInfo = array();
    if (isset($korp_info['corpora'][$corpusKey]['info']) && is_array($korp_info['corpora'][$corpusKey]['info'])) {
        $corpusInfo = $korp_info['corpora'][$corpusKey]['info'];
    }

    $identifier = htmlspecialchars($desc['identifier']);
    $extent = htmlspecialchars($desc['extent']);
    $tokens = $extent;
    if (isset($corpusInfo['Size']) && is_numeric($corpusInfo['Size'])) {
        $tokens = $corpusInfo['Size'];
    }

    //begin building the output
    $output = ' ' . htmlspecialchars($desc['title']['en']) . ' ' . htmlspecialchars($desc['title']['sv']) . ' ' . htmlspecialchars($desc['description']['en']) . ' ' . htmlspecialchars($desc['description']['sv']) . ' ' . $identifier . ' ' . $identifier . ' ';
    if (!empty($desc['handle'])) {
        $output .= '' . htmlspecialchars($desc['handle']) . '';
    } else {
        // no handle known for this resource: make up a unique one
        $output .= 'hdl:10794/' . uniqid("", true) . '';
    }
    $output .= '' . $identifier . ' ';
    $output .= 'available-restrictedUse';

    if (array_key_exists('relation', $desc)) {
        if (array_key_exists('downloads', $desc['relation'])) {
            foreach ($desc['relation']['downloads'] as $d) {
                $output .= ' ' . $licences . ' shareAlike downloadable ' . htmlspecialchars($d['url']) . ' ';
            }
        }
        if (array_key_exists('accessibleThroughInterface', $desc['relation'])) {
            foreach ($desc['relation']['accessibleThroughInterface'] as $d) {
                $output .= ' ' . $licences . ' shareAlike accessibleThroughInterface ' . htmlspecialchars($d['url']) . ' ';
            }
        }
        if (array_key_exists('other', $desc['relation'])) {
            foreach ($desc['relation']['other'] as $o) {
                $output .= ' ' . $licences . ' shareAlike other ' . htmlspecialchars($o['url']) . ' ';
            }
        }
    }

    // fixed contact / metadata-creator section, stamped with today's date
    $output .= ' Forsberg Markus sb-info@svenska.gu.se Språkbanken, University of Gothenburg Språkbanken sb-info@svenska.gu.se Forsberg Markus sb-info@svenska.gu.se Språkbanken sb-info@svenska.gu.se ' . date("Y-m-d") . ' Olsson Olof sb-info@svenska.gu.se Språkbanken sb-info@svenska.gu.se META-NORD true formal automatic full ';

    //Documentation (manuals)
    $output .= ' http://spraakbanken.gu.se/eng/resource/' . $identifier . ' manual online ';
    $output .= ' ';

    //corpus
    if ($desc['type'] == 'corpus') {
        $output .= ' corpus text monolingual swe Swedish Latn other ' . $tokens . ' tokens ';
        $output .= ' ' . $tokens . ' tokens ';
        if (isset($corpusInfo['Sentences']) && is_numeric($corpusInfo['Sentences'])) {
            $output .= ' ' . $corpusInfo['Sentences'] . ' sentences ';
        }
        $output .= ' ';
    } else if ($desc['type'] == 'lex') {
        $output .= ' lexicalConceptualResource computationalLexicon morphology LMF text monolingual swe Swedish writtenLanguage ' . $extent . ' entries other ' . $extent . ' entries general ' . $extent . ' entries other ' . $extent . ' entries other ' . $extent . ' entries ';
    }

    $output .= ' ';
    return $output;
}

/**********
 * returns the text of the first node matched by an xpath query
 * @xml   SimpleXMLElement to query
 * @query xpath expression
 * returns the node text, or '' when nothing matches or the query fails
 */
function _firstXpathValue($xml, $query) {
    $nodes = $xml->xpath($query);
    if (is_array($nodes) && isset($nodes[0])) {
        return (string) $nodes[0];
    }
    return '';
}

/**********
 * tells whether a string contains at least one of a list of substrings
 * @haystack string to search in
 * @needles  array of substrings (case-sensitive)
 * returns true on the first hit, false otherwise
 */
function _containsAnyOf($haystack, $needles) {
    foreach ($needles as $needle) {
        if (strpos($haystack, $needle) !== false) {
            return true;
        }
    }
    return false;
}

/**********
 * extracts the descriptions from the Dublin Core elements
 *
 * Each dc:relation with a non-empty URL is sorted into one of four groups,
 * tested in this order: downloads (by xsi:format, plus one known URL),
 * accessibleThroughInterface (by URL), resourceDocumentationInfo/manual
 * (by URL), and other. The 'relation' key is only present when at least one
 * relation was kept.
 *
 * @xml xml for the dublin core
 * returns array of the elements
 */
function _getBaseDescriptionArray($xml) {
    $tmp = array();
    $tmp['identifier'] = _firstXpathValue($xml, '//dc:identifier');
    $tmp['extent'] = _firstXpathValue($xml, '//dcterms:extent');
    $tmp['type'] = _firstXpathValue($xml, '//dc:type');
    $tmp['license'] = _firstXpathValue($xml, '//dc:license');
    $tmp['title']['sv'] = _firstXpathValue($xml, '//dc:title[@xml:lang = "sv"]');
    $tmp['title']['en'] = _firstXpathValue($xml, '//dc:title[@xml:lang = "en"]');
    $tmp['description']['sv'] = _firstXpathValue($xml, '//dc:description[@xml:lang = "sv"]');
    $tmp['description']['en'] = _firstXpathValue($xml, '//dc:description[@xml:lang = "en"]');

    // substrings of the (lower-cased) xsi:format that mark a downloadable file
    $downloadFormats = array('xml', 'lmf', 'html', 'bz2', 'txt', 'xls');
    // URL fragments of search interfaces
    $interfaceUrls = array(
        'korp/#corpus=',
        'glossa/html',
        'FTS/search.phtml',
        'http://www.medicinskkorpus.se/login.phtml',
        'http://www.dramawebben.se/',
        'http://sv.wikipedia.org/',
        'karp',
    );
    // URL fragments of documentation pages
    $manualUrls = array(
        'korp/ws',
        'simple_parole_index.html',
        'http://spraakbanken.gu.se/strindberg/',
    );

    $relations = $xml->xpath('//dc:relation');
    if (!is_array($relations)) {
        $relations = array();
    }

    foreach ($relations as $rel) {
        $url = trim((string) $rel[0]);
        if (empty($url)) {
            continue;
        }

        // attributes in the namespace bound to the "xsi" prefix
        $xsi = $rel->attributes("xsi", true);
        $format = (string) $xsi['format'];

        $resource = array();
        $resource['url'] = $url;
        $resource['label'] = $format;

        if (_containsAnyOf(strtolower($format), $downloadFormats) || strpos($url, 'pub/reskit/parole.zip') !== false) {
            $tmp['relation']['downloads'][] = $resource;
        } else if (_containsAnyOf($url, $interfaceUrls)) {
            $tmp['relation']['accessibleThroughInterface'][] = $resource;
        } else if (_containsAnyOf($url, $manualUrls)) {
            $tmp['relation']['resourceDocumentationInfo']['manual'][] = $resource;
        } else {
            $tmp['relation']['other'][] = $resource;
        }
    }

    return $tmp;
}

/******
 * get the identifiers from a folder containing existing metanord-records
 *
 * Every entry of the folder except ".", ".." and ".svn" is loaded as XML;
 * entries that cannot be loaded are skipped. The lower-cased
 * IdentificationInfo/resourceShortName is used as key, with the two
 * "...derwall" names mapped to the fixed keys "soederwall_supp" and
 * "soederwall".
 *
 * @path path to the folder
 * return array keyed by short name; each value has 'name', 'pid' and
 *        'handle' ('hdl:10794/' followed by the pid). Empty when the folder
 *        cannot be opened.
 */
function getLegacyIDs($path) {
    $result = array();

    $handle = opendir($path);
    if ($handle === false) {
        return $result;
    }

    while (false !== ($entry = readdir($handle))) {
        if ($entry == "." || $entry == ".." || $entry == ".svn") {
            continue;
        }

        $xml = simplexml_load_file($path . '/' . $entry);
        if ($xml === false) {
            // not a readable, well-formed XML file (e.g. a sub-directory)
            continue;
        }

        $pid = (string) $xml->IdentificationInfo->pid;
        $resourceShortName = strtolower((string) $xml->IdentificationInfo->resourceShortName);

        // strpos() returns 0 for a match at the very start, so compare
        // against false instead of relying on truthiness
        if (strpos($resourceShortName, 'derwall supplement') !== false) {
            $resourceShortName = "soederwall_supp";
        } else if (strpos($resourceShortName, 'derwall') !== false) {
            $resourceShortName = "soederwall";
        }

        if (!empty($resourceShortName)) {
            $result[$resourceShortName]['name'] = $resourceShortName;
            $result[$resourceShortName]['pid'] = $pid;
            $result[$resourceShortName]['handle'] = 'hdl:10794/' . $pid;
        }
    }

    closedir($handle);
    return $result;
}

////////////////////////////////////////////////////////////////////////////////

/******
 * formats one libxml error as a single tab-indented line
 * @error error object as returned by libxml_get_errors(); its level, code,
 *        message, file and line properties are read
 * returns "\t[<level> <code> : ]<message>[ in <file>] on line <line>\t\n";
 *         the level prefix is only present for warning, error and fatal
 */
function libxml_display_error($error) {
    $return = "\t";
    switch ($error->level) {
        case LIBXML_ERR_WARNING:
            $return .= "Warning $error->code : ";
            break;
        case LIBXML_ERR_ERROR:
            $return .= "Error $error->code : ";
            break;
        case LIBXML_ERR_FATAL:
            $return .= "Fatal Error $error->code : ";
            break;
    }
    $return .= trim($error->message);
    if ($error->file) {
        $return .= " in $error->file";
    }
    $return .= " on line $error->line\t";
    return $return . "\n";
}

/******
 * prints every error currently held in the libxml error buffer, one per
 * line, then empties the buffer
 */
function libxml_display_errors() {
    $errors = libxml_get_errors();
    foreach ($errors as $error) {
        print libxml_display_error($error);
    }
    libxml_clear_errors();
}
?>