-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTransformJob.php
254 lines (203 loc) · 9 KB
/
TransformJob.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
<?php
/**
 * Scans an html page (given by $url) for html link elements; checks the http
 * HEAD for each link, and if the Content-Type header contains 'xml', fetches
 * the file and does simple introspection to identify the metadata dialect of
 * the content. If a known dialect is recognized, the file is transformed to
 * ISO19139 XML using an xslt for that dialect. The ISO19139 result is written
 * to a location on the server running this php script; that location is
 * specified by the $thedir variable.
 *
 * The work is done by the parse_dir function; its input parameters are
 * supplied by the variables defined below ($base_url, $url, and $thedir).
 * The specified URL location is scanned recursively -- if one of the links
 * points at an html page (Content-Type contains 'html'), parse_dir is called
 * recursively to examine that page for links, and the final token of the link
 * (after the last '/') is used to generate a subdirectory that holds any
 * transformed metadata found there.
 *
 * This routine is designed for scanning web-accessible folders containing
 * metadata intended for harvesting.
 * This code is based on an example from http://htmlparsing.com/php.html
 * SMR 2018-06-01 Version 1.0
 */
# $url is the location of the root directory that contains xml metadata records
# or other subdirectories
# $base_url is the base url for relative links found in documents at $url
# $thedir is the path to a file-system directory accessible by the server
# running this script; an output directory tree will be built there based on
# what is found at $url
#$base_url="http://hydro10.sdsc.edu";
$base_url="http://132.249.238.169:8080";
#$url = "http://hydro10.sdsc.edu/metadata/Wyoming_GeoLibrary/";
$url = "http://132.249.238.169:8080/metadata/";
$thedir="./sitemaptest/";
# transforms from various metadata dialects to ISO19139; the xslt sources are
# loaded from the USGIN organization metadataTransforms gitHub repository
$DataCitetoISOXslfile = file_get_contents("https://raw.githubusercontent.com/usgin/metadataTransforms/master/dataciteToISO19139v3.2.xslt");
$DublinCoretoISOXslfile = file_get_contents("https://raw.githubusercontent.com/usgin/metadataTransforms/master/qualifiedDCToISO19139v1.0.xslt");
$EMLtoISOXslfile=file_get_contents("https://raw.githubusercontent.com/usgin/metadataTransforms/master/eml2iso19139.xsl");
#eml transform has not been tested!
$CSDGMtoISOXslfile=file_get_contents("https://raw.githubusercontent.com/usgin/metadataTransforms/master/csdgm2iso19115_usgin3.0.xslt");
# create the XSLTProcessor once, before entering the scan, so the xsl sources
# do not have to be re-read on every file
$xslt = new XSLTProcessor();
# this is where the work gets done
function parse_dir($target,$url,$base_url){
    # Recursively scan the html page at $url for <a> links:
    #  - links whose Content-Type contains 'xml' are fetched, sniffed to
    #    identify their metadata dialect, transformed to ISO19139 with the
    #    matching xslt, and written into $target;
    #  - links whose Content-Type contains 'html' are recursed into, with a
    #    subdirectory of $target created from the last url path segment.
    # Parameters:
    #   $target   directory where result files will be written
    #   $url      url of the page on the web to scan for links
    #   $base_url base url used to resolve relative links found at $url
    #beware of odd behavior using global xslt processor in recursive calls....
    global $xslt, $DataCitetoISOXslfile, $DublinCoretoISOXslfile, $EMLtoISOXslfile, $CSDGMtoISOXslfile, $filecount;
    # Initialize the shared tally exactly once. (Bug fix: the original reset
    # $filecount to 0 on every call, so each recursive descent wiped the
    # count accumulated so far and the final total was wrong.)
    if (!isset($filecount)) {
        $filecount = 0;
    }
    # Use the Curl extension to fetch the page we will scan for links
    $ch = curl_init();
    $timeout = 5;
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    $html = curl_exec($ch);
    curl_close($ch);
    if ($html === false) {
        # fetch failed; nothing to scan in this branch
        echo "could not retrieve ".$url."<br/>";
        return;
    }
    # strip a trailing '/' so the last url segment is easy to isolate
    if (substr(trim($url), -1)=='/'){
        $url = substr($url, 0, strlen($url)-1);
    }
    $urltokens = explode('/', $url);
    # scheme ("http:") + "//" + authority ("host:port")
    $host = $urltokens[0]."//".$urltokens[2];
    echo "host: ".$host."<br/>";
    # Parse the page as html. The @ suppresses the warnings loadHTML emits
    # for the invalid markup common in directory-listing pages.
    $dom = new DOMDocument();
    @$dom->loadHTML($html);
    echo "dirname: ".$target."<br/>";
    echo "location: ".$url."<br/>";
    # Iterate over all the <a> tags
    foreach ($dom->getElementsByTagName('a') as $link) {
        $thehref = $link->getAttribute('href');
        # Skip empty hrefs and links that point back up the tree (the href is
        # a substring of the current url). Explicit !== false avoids the
        # classic strpos falsy-0 trap.
        if ($thehref === '' || strpos($url, $thehref) !== false){
            echo $thehref." is link to ancestor directory"."<br/>";
            continue;
        }
        $service = $host.$thehref;
        # http HEAD request -- slows things down, but avoids downloading
        # content we cannot use
        $headers = get_headers($service, 1);
        if ($headers === false || strpos($headers[0], 'HTTP/1.1 200') === false) {
            echo $service." did not return a valid response"."<br/>";
            continue;
        }
        $contenttype = $headers["Content-Type"];
        # when redirects occurred, get_headers returns an array of
        # Content-Type values; the last one describes the final response
        if (is_array($contenttype)) {
            $contenttype = end($contenttype);
        }
        echo "<br/>";
        # the last segment of the href names the output file or subdirectory
        if (substr(trim($thehref), -1)=='/'){
            $thehref = substr($thehref, 0, strlen($thehref)-1);
        }
        $urltokens = explode('/', $thehref);
        if (strpos($contenttype, 'xml') === false){
            echo $thehref." is not an xml file";
            echo "<br />";
            if (strpos($contenttype, 'html') !== false){
                # an html link: recurse into it, collecting any transformed
                # metadata in a new subdirectory of the output tree
                $dirname = $urltokens[count($urltokens)-1];
                echo "dirname: ".$dirname."<br/>";
                $thedir = $target.$dirname."/";
                if (!is_dir($thedir)){
                    echo "mkdir: ".$thedir."<br/>";
                    mkdir($thedir);
                }
                echo "call parse_dir: ".$thedir.", ".$service."<br/>";
                parse_dir($thedir, $service, $base_url);
            }
            continue;
        }
        # an xml link: build the ISO output file name from the last segment
        $thetoken = $urltokens[count($urltokens)-1];
        $thetoken = str_replace("%3A", "-", $thetoken); # url-encoded ':' chars become '-'
        if (substr($thetoken, -3, 3)=='xml'){
            $thetoken = str_replace(".xml", "-iso.xml", $thetoken);
        } else {
            $thetoken = $thetoken."-iso.xml";
        }
        $my_file = $target.$thetoken;
        # existing output files are never overwritten, so an interrupted
        # scan can be resumed without redoing finished work
        if (file_exists($my_file)){
            echo $my_file." already processed";
            echo "<br/>";
            continue;
        }
        # NOTE: the redundant second check of $headers[0] from the original
        # was removed -- the check above already skips non-200 responses.
        try {
            echo "processing <a href='".$service."'>".$thetoken."</a>";
            echo "<br />";
            $content = file_get_contents($service);
            if ($content === false) {
                echo $service." could not be retrieved"."<br/>";
                continue;
            }
            # sniff the dialect from the first 500 characters; these tests
            # are rudimentary -- refine them if records are misclassified
            $teststring = substr($content, 0, 500);
            if (strpos($teststring, "MD_Metadata") !== false){
                echo $service." is already ISO19139 <br/>";
                continue;
            } elseif (strpos($teststring, "MI_Metadata") !== false) {
                echo $service." is ISO19139-2 <br/>";
                continue;
            } elseif (strpos($teststring, "eml") !== false) {
                echo $service." is eml <br/>";
                $xslt->importStylesheet(new SimpleXMLElement($EMLtoISOXslfile));
            } elseif (strpos($teststring, "idinfo") !== false) {
                echo $service." is CSDGM <br/>";
                $xslt->importStylesheet(new SimpleXMLElement($CSDGMtoISOXslfile));
            } elseif (strpos($teststring, "datacite.org/schema") !== false) {
                echo $service." is DataCite xml <br/>";
                # bug fix: the original loaded the CSDGM stylesheet here, so
                # DataCite records were transformed with the wrong xslt
                $xslt->importStylesheet(new SimpleXMLElement($DataCitetoISOXslfile));
            } elseif (strpos($teststring, "www.openarchives.org/OAI/2.0") !== false) {
                echo $service." is OAI Dublin core <br/>";
                $xslt->importStylesheet(new SimpleXMLElement($DublinCoretoISOXslfile));
            } elseif (strpos($teststring, "csw:record") !== false) {
                echo $service." is CSW record Dublin core <br/>";
                $xslt->importStylesheet(new SimpleXMLElement($DublinCoretoISOXslfile));
            } elseif (strpos($teststring, "rdf:Description") !== false) {
                if (strpos($content, "dc:title") !== false){
                    echo $service." is RDF:Descriptions wrapped Dublin core <br/>";
                    $xslt->importStylesheet(new SimpleXMLElement($DublinCoretoISOXslfile));
                } else {
                    echo $service." has rdf:Description, no dc:title <br/>";
                    # bug fix: the original fell through here and transformed
                    # the record with whatever stylesheet was imported last
                    continue;
                }
            } else {
                echo $service." has an unrecognized metadata format <br/>";
                continue;
            }
            $newxml = $xslt->transformToXml(new SimpleXMLElement($content));
            # create the output directory lazily, only when there is a
            # transformed record to write into it
            if (!is_dir($target)){
                echo "mkdir: ".$target."<br/>";
                mkdir($target);
            }
            $handle = fopen($my_file, 'w') or die('Cannot open file: '.$my_file); //implicitly creates file
            fwrite($handle, $newxml);
            fclose($handle);
            $filecount = $filecount + 1;
            echo "<br/>";
        } catch (Exception $message) {
            echo 'Caught exception: ', $message->getMessage(), "\n";
        }
    } #end of for each link loop
} # end of parse_dir function definition
# make sure the output root exists, then kick off the recursive scan and
# report how many files were processed
if (!is_dir($thedir)) {
    mkdir($thedir);
}
parse_dir($thedir, $url, $base_url);
echo "hey, I finished! {$filecount} files processed";
?>