| File | /usr/share/perl5/MARC/Charset.pm |
| Statements Executed | 34 |
| Total Time | 0.0024698 seconds |
| Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::BEGIN |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::_process_escape |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::assume_encoding |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::assume_unicode |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::ignore_errors |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::marc8_to_utf8 |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::reset_charsets |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::utf8_to_marc8 |
| Line | Stmts. | Exclusive Time | Avg. | Code |
|---|---|---|---|---|
| 1 | package MARC::Charset; | |||
| 2 | ||||
| 3 | 1 | 900ns | 900ns | our $VERSION = '0.98'; |
| 4 | 3 | 34µs | 11µs | use strict; # spent 15µs making 1 call to strict::import |
| 5 | 3 | 32µs | 11µs | use warnings; # spent 30µs making 1 call to warnings::import |
| 6 | ||||
| 7 | 3 | 42µs | 14µs | use base qw(Exporter); # spent 74µs making 1 call to base::import |
| 8 | 1 | 2µs | 2µs | our @EXPORT_OK = qw(marc8_to_utf8 utf8_to_marc8); |
| 9 | ||||
| 10 | 3 | 170µs | 57µs | use Unicode::Normalize; # spent 70µs making 1 call to Exporter::import |
| 11 | 3 | 151µs | 50µs | use Encode 'decode'; # spent 95µs making 1 call to Exporter::import |
| 12 | 3 | 131µs | 44µs | use charnames ':full'; # spent 216µs making 1 call to charnames::import |
| 13 | 3 | 156µs | 52µs | use MARC::Charset::Table; # spent 4µs making 1 call to import |
| 14 | 3 | 1.72ms | 575µs | use MARC::Charset::Constants qw(:all); # spent 266µs making 1 call to Exporter::import |
| 15 | ||||
| 16 | =head1 NAME | |||
| 17 | ||||
| 18 | MARC::Charset - convert MARC-8 encoded strings to UTF-8 | |||
| 19 | ||||
| 20 | =head1 SYNOPSIS | |||
| 21 | ||||
| 22 | # import the marc8_to_utf8 function | |||
| 23 | use MARC::Charset 'marc8_to_utf8'; | |||
| 24 | ||||
| 25 | # prepare STDOUT for utf8 | |||
| 26 | binmode(STDOUT, 'utf8'); | |||
| 27 | ||||
| 28 | # print out some marc8 as utf8 | |||
| 29 | print marc8_to_utf8($marc8_string); | |||
| 30 | ||||
| 31 | =head1 DESCRIPTION | |||
| 32 | ||||
| 33 | MARC::Charset allows you to turn MARC-8 encoded strings into UTF-8 | |||
| 34 | strings. MARC-8 is a single byte character encoding that predates unicode, and | |||
| 35 | allows you to put non-Roman scripts in MARC bibliographic records. | |||
| 36 | ||||
| 37 | http://www.loc.gov/marc/specifications/spechome.html | |||
| 38 | ||||
| 39 | =head1 EXPORTS | |||
| 40 | ||||
| 41 | =cut | |||
| 42 | ||||
| 43 | # get the mapping table | |||
| 44 | 1 | 10µs | 10µs | our $table = MARC::Charset::Table->new(); # spent 117µs making 1 call to MARC::Charset::Table::new |
| 45 | ||||
| 46 | # set default character sets | |||
| 47 | # these are viewable at the package level | |||
| 48 | # in case someone wants to set them | |||
| 49 | 1 | 800ns | 800ns | our $DEFAULT_G0 = ASCII_DEFAULT; |
| 50 | 1 | 600ns | 600ns | our $DEFAULT_G1 = EXTENDED_LATIN; |
| 51 | ||||
| 52 | =head2 ignore_errors() | |||
| 53 | ||||
| 54 | Tells MARC::Charset whether or not to ignore all encoding errors, and | |||
| 55 | returns the current setting. This is helepfuli if you have records that | |||
| 56 | contain both MARC8 and UNICODE characters. | |||
| 57 | ||||
| 58 | my $ignore = MARC::Charset->ignore_errors(); | |||
| 59 | ||||
| 60 | MARC::Charset->ignore_errors(1); # ignore errors | |||
| 61 | MARC::Charset->ignore_errors(0); # DO NOT ignore errors | |||
| 62 | ||||
| 63 | =cut | |||
| 64 | ||||
| 65 | ||||
| 66 | 1 | 400ns | 400ns | our $_ignore_errors = 0; |
| 67 | sub ignore_errors { | |||
| 68 | my ($self,$i) = @_; | |||
| 69 | $_ignore_errors = $i if (defined($i)); | |||
| 70 | return $_ignore_errors; | |||
| 71 | } | |||
| 72 | ||||
| 73 | ||||
| 74 | =head2 assume_unicode() | |||
| 75 | ||||
| 76 | Tells MARC::Charset whether or not to assume UNICODE when an error is | |||
| 77 | encountered in ignore_errors mode and returns the current setting. | |||
| 78 | This is helepfuli if you have records that contain both MARC8 and UNICODE | |||
| 79 | characters. | |||
| 80 | ||||
| 81 | my $setting = MARC::Charset->assume_unicode(); | |||
| 82 | ||||
| 83 | MARC::Charset->assume_unicode(1); # assume characters are unicode (utf-8) | |||
| 84 | MARC::Charset->assume_unicode(0); # DO NOT assume characters are unicode | |||
| 85 | ||||
| 86 | =cut | |||
| 87 | ||||
| 88 | ||||
| 89 | 1 | 500ns | 500ns | our $_assume = ''; |
| 90 | sub assume_unicode { | |||
| 91 | my ($self,$i) = @_; | |||
| 92 | $_assume = 'utf8' if (defined($i) and $i); | |||
| 93 | return 1 if ($_assume eq 'utf8'); | |||
| 94 | } | |||
| 95 | ||||
| 96 | ||||
| 97 | =head2 assume_encoding() | |||
| 98 | ||||
| 99 | Tells MARC::Charset whether or not to assume a specific encoding when an error | |||
| 100 | is encountered in ignore_errors mode and returns the current setting. This | |||
| 101 | is helpful if you have records that contain both MARC8 and other characters. | |||
| 102 | ||||
| 103 | my $setting = MARC::Charset->assume_encoding(); | |||
| 104 | ||||
| 105 | MARC::Charset->assume_encoding('cp850'); # assume characters are cp850 | |||
| 106 | MARC::Charset->assume_encoding(''); # DO NOT assume any encoding | |||
| 107 | ||||
| 108 | =cut | |||
| 109 | ||||
| 110 | ||||
| 111 | sub assume_encoding { | |||
| 112 | my ($self,$i) = @_; | |||
| 113 | $_assume = $i if (defined($i)); | |||
| 114 | return $_assume; | |||
| 115 | } | |||
| 116 | ||||
| 117 | ||||
| 118 | # place holders for working graphical character sets | |||
| 119 | 1 | 300ns | 300ns | my $G0; |
| 120 | 1 | 300ns | 300ns | my $G1; |
| 121 | ||||
| 122 | =head2 marc8_to_utf8() | |||
| 123 | ||||
| 124 | Converts a MARC-8 encoded string to UTF-8. | |||
| 125 | ||||
| 126 | my $utf8 = marc8_to_utf8($marc8); | |||
| 127 | ||||
| 128 | If you'd like to ignore errors pass in a true value as the 2nd | |||
| 129 | parameter or call MARC::Charset->ignore_errors() with a true | |||
| 130 | value: | |||
| 131 | ||||
| 132 | my $utf8 = marc8_to_utf8($marc8, 'ignore-errors'); | |||
| 133 | ||||
| 134 | or | |||
| 135 | ||||
| 136 | MARC::Charset->ignore_errors(1); | |||
| 137 | my $utf8 = marc8_to_utf8($marc8); | |||
| 138 | ||||
| 139 | =cut | |||
| 140 | ||||
| 141 | ||||
| 142 | sub marc8_to_utf8 | |||
| 143 | { | |||
| 144 | my ($marc8, $ignore_errors) = @_; | |||
| 145 | reset_charsets(); | |||
| 146 | ||||
| 147 | $ignore_errors = $_ignore_errors if (!defined($ignore_errors)); | |||
| 148 | ||||
| 149 | # holder for our utf8 | |||
| 150 | my $utf8 = ''; | |||
| 151 | ||||
| 152 | my $index = 0; | |||
| 153 | my $length = length($marc8); | |||
| 154 | my $combining = ''; | |||
| 155 | CHAR_LOOP: while ($index < $length) | |||
| 156 | { | |||
| 157 | # whitespace, line feeds and carriage returns just get added on unmolested | |||
| 158 | if (substr($marc8, $index, 1) =~ m/(\s+|\x0A+|\x0D+)/so) | |||
| 159 | { | |||
| 160 | $utf8 .= $1; | |||
| 161 | $index += 1; | |||
| 162 | next CHAR_LOOP; | |||
| 163 | } | |||
| 164 | ||||
| 165 | # look for any escape sequences | |||
| 166 | my $new_index = _process_escape(\$marc8, $index, $length); | |||
| 167 | if ($new_index > $index) | |||
| 168 | { | |||
| 169 | $index = $new_index; | |||
| 170 | next CHAR_LOOP; | |||
| 171 | } | |||
| 172 | ||||
| 173 | my $found; | |||
| 174 | CHARSET_LOOP: foreach my $charset ($G0, $G1) | |||
| 175 | { | |||
| 176 | ||||
| 177 | # cjk characters are a string of three chars | |||
| 178 | my $char_size = $charset eq CJK ? 3 : 1; | |||
| 179 | ||||
| 180 | # extract the next code point to examine | |||
| 181 | my $chunk = substr($marc8, $index, $char_size); | |||
| 182 | ||||
| 183 | # look up the character to see if it's in our mapping | |||
| 184 | my $code = $table->lookup_by_marc8($charset, $chunk); | |||
| 185 | ||||
| 186 | # try the next character set if no mapping was found | |||
| 187 | next CHARSET_LOOP if ! $code; | |||
| 188 | $found = 1; | |||
| 189 | ||||
| 190 | # gobble up all combining characters for appending later | |||
| 191 | # this is necessary because combinging characters precede | |||
| 192 | # the character they modifiy in MARC-8, whereas they follow | |||
| 193 | # the character they modify in UTF-8. | |||
| 194 | if ($code->is_combining()) | |||
| 195 | { | |||
| 196 | $combining .= $code->char_value(); | |||
| 197 | } | |||
| 198 | else | |||
| 199 | { | |||
| 200 | $utf8 .= $code->char_value() . $combining; | |||
| 201 | $combining = ''; | |||
| 202 | } | |||
| 203 | ||||
| 204 | $index += $char_size; | |||
| 205 | next CHAR_LOOP; | |||
| 206 | } | |||
| 207 | ||||
| 208 | if (!$found) | |||
| 209 | { | |||
| 210 | warn(sprintf("no mapping found for [0x\%X] at position $index in $marc8 ". | |||
| 211 | "g0=".MARC::Charset::Constants::charset_name($G0) . " " . | |||
| 212 | "g1=".MARC::Charset::Constants::charset_name($G1), unpack('C',substr($marc8,$index,1)))); | |||
| 213 | if (!$ignore_errors) | |||
| 214 | { | |||
| 215 | reset_charsets(); | |||
| 216 | return; | |||
| 217 | } | |||
| 218 | if ($_assume) | |||
| 219 | { | |||
| 220 | reset_charsets(); | |||
| 221 | return NFC(decode($_assume => $marc8)); | |||
| 222 | } | |||
| 223 | $index += 1; | |||
| 224 | } | |||
| 225 | ||||
| 226 | } | |||
| 227 | ||||
| 228 | # return the utf8 | |||
| 229 | reset_charsets(); | |||
| 230 | return $utf8; | |||
| 231 | } | |||
| 232 | ||||
| 233 | ||||
| 234 | ||||
| 235 | =head2 utf8_to_marc8() | |||
| 236 | ||||
| 237 | Will attempt to translate utf8 into marc8. | |||
| 238 | ||||
| 239 | my $marc8 = utf8_to_marc8($utf8); | |||
| 240 | ||||
| 241 | If you'd like to ignore errors, or characters that can't be | |||
| 242 | converted to marc8 then pass in a true value as the second | |||
| 243 | parameter: | |||
| 244 | ||||
| 245 | my $marc8 = utf8_to_marc8($utf8, 'ignore-errors'); | |||
| 246 | ||||
| 247 | or | |||
| 248 | ||||
| 249 | MARC::Charset->ignore_errors(1); | |||
| 250 | my $utf8 = marc8_to_utf8($marc8); | |||
| 251 | ||||
| 252 | =cut | |||
| 253 | ||||
| 254 | sub utf8_to_marc8 | |||
| 255 | { | |||
| 256 | my ($utf8, $ignore_errors) = @_; | |||
| 257 | reset_charsets(); | |||
| 258 | ||||
| 259 | $ignore_errors = $_ignore_errors if (!defined($ignore_errors)); | |||
| 260 | ||||
| 261 | # decompose combined characters | |||
| 262 | $utf8 = NFD($utf8); | |||
| 263 | ||||
| 264 | my $len = length($utf8); | |||
| 265 | my $marc8 = ''; | |||
| 266 | for (my $i=0; $i<$len; $i++) | |||
| 267 | { | |||
| 268 | my $slice = substr($utf8, $i, 1); | |||
| 269 | ||||
| 270 | # spaces are copied from utf8 into marc8 | |||
| 271 | if ($slice eq ' ') | |||
| 272 | { | |||
| 273 | $marc8 .= ' '; | |||
| 274 | next; | |||
| 275 | } | |||
| 276 | ||||
| 277 | # try to find the code point in our mapping table | |||
| 278 | my $code = $table->lookup_by_utf8($slice); | |||
| 279 | ||||
| 280 | if (! $code) | |||
| 281 | { | |||
| 282 | warn("no mapping found at position $i in $utf8"); | |||
| 283 | reset_charsets() and return unless $ignore_errors; | |||
| 284 | } | |||
| 285 | ||||
| 286 | # if it's a combining character move it around | |||
| 287 | if ($code->is_combining()) | |||
| 288 | { | |||
| 289 | my $prev = chop($marc8); | |||
| 290 | $marc8 .= $code->marc_value() . $prev; | |||
| 291 | next; | |||
| 292 | } | |||
| 293 | ||||
| 294 | # look to see if we need to escape to a new G0 charset | |||
| 295 | my $charset_value = $code->charset_value(); | |||
| 296 | ||||
| 297 | if ($code->default_charset_group() eq 'G0' | |||
| 298 | and $G0 ne $charset_value) | |||
| 299 | { | |||
| 300 | if ($G0 eq ASCII_DEFAULT and $charset_value eq BASIC_LATIN) | |||
| 301 | { | |||
| 302 | # don't bother escaping, they're functionally the same | |||
| 303 | } | |||
| 304 | else | |||
| 305 | { | |||
| 306 | $marc8 .= $code->get_escape(); | |||
| 307 | $G0 = $charset_value; | |||
| 308 | } | |||
| 309 | } | |||
| 310 | ||||
| 311 | # look to see if we need to escape to a new G1 charset | |||
| 312 | elsif ($code->default_charset_group() eq 'G1' | |||
| 313 | and $G1 ne $charset_value) | |||
| 314 | { | |||
| 315 | $marc8 .= $code->get_escape(); | |||
| 316 | $G1 = $charset_value; | |||
| 317 | } | |||
| 318 | ||||
| 319 | $marc8 .= $code->marc_value(); | |||
| 320 | } | |||
| 321 | ||||
| 322 | # escape back to default G0 if necessary | |||
| 323 | if ($G0 ne $DEFAULT_G0) | |||
| 324 | { | |||
| 325 | if ($DEFAULT_G0 eq ASCII_DEFAULT) { $marc8 .= ESCAPE . ASCII_DEFAULT; } | |||
| 326 | elsif ($DEFAULT_G0 eq CJK) { $marc8 .= ESCAPE . MULTI_G0_A . CJK; } | |||
| 327 | else { $marc8 .= ESCAPE . SINGLE_G0_A . $DEFAULT_G0; } | |||
| 328 | } | |||
| 329 | ||||
| 330 | # escape back to default G1 if necessary | |||
| 331 | if ($G1 ne $DEFAULT_G1) | |||
| 332 | { | |||
| 333 | if ($DEFAULT_G1 eq CJK) { $marc8 .= ESCAPE . MULTI_G1_A . $DEFAULT_G1; } | |||
| 334 | else { $marc8 .= ESCAPE . SINGLE_G1_A . $DEFAULT_G1; } | |||
| 335 | } | |||
| 336 | ||||
| 337 | return $marc8; | |||
| 338 | } | |||
| 339 | ||||
| 340 | ||||
| 341 | ||||
| 342 | =head1 DEFAULT CHARACTER SETS | |||
| 343 | ||||
| 344 | If you need to alter the default character sets you can set the | |||
| 345 | $MARC::Charset::DEFAULT_G0 and $MARC::Charset::DEFAULT_G1 variables to the | |||
| 346 | appropriate character set code: | |||
| 347 | ||||
| 348 | use MARC::Charset::Constants qw(:all); | |||
| 349 | $MARC::Charset::DEFAULT_G0 = BASIC_ARABIC; | |||
| 350 | $MARC::Charset::DEFAULT_G1 = EXTENDED_ARABIC; | |||
| 351 | ||||
| 352 | =head1 SEE ALSO | |||
| 353 | ||||
| 354 | =over 4 | |||
| 355 | ||||
| 356 | =item * L<MARC::Charset::Constant> | |||
| 357 | ||||
| 358 | =item * L<MARC::Charset::Table> | |||
| 359 | ||||
| 360 | =item * L<MARC::Charset::Code> | |||
| 361 | ||||
| 362 | =item * L<MARC::Charset::Compiler> | |||
| 363 | ||||
| 364 | =item * L<MARC::Record> | |||
| 365 | ||||
| 366 | =item * L<MARC::XML> | |||
| 367 | ||||
| 368 | =back | |||
| 369 | ||||
| 370 | =head1 AUTHOR | |||
| 371 | ||||
| 372 | Ed Summers (ehs@pobox.com) | |||
| 373 | ||||
| 374 | =cut | |||
| 375 | ||||
| 376 | ||||
| 377 | sub _process_escape | |||
| 378 | { | |||
| 379 | ## this stuff is kind of scary ... for an explanation of what is | |||
| 380 | ## going on here check out the MARC-8 specs at LC. | |||
| 381 | ## http://lcweb.loc.gov/marc/specifications/speccharmarc8.html | |||
| 382 | my ($str_ref, $left, $right) = @_; | |||
| 383 | ||||
| 384 | # first char needs to be an escape or else this isn't an escape sequence | |||
| 385 | return $left unless substr($$str_ref, $left, 1) eq ESCAPE; | |||
| 386 | ||||
| 387 | ## if we don't have at least one character after the escape | |||
| 388 | ## then this can't be a character escape sequence | |||
| 389 | return $left if ($left+1 >= $right); | |||
| 390 | ||||
| 391 | ## pull off the first escape | |||
| 392 | my $esc_char_1 = substr($$str_ref, $left+1, 1); | |||
| 393 | ||||
| 394 | ## the first method of escaping to small character sets | |||
| 395 | if ( $esc_char_1 eq GREEK_SYMBOLS | |||
| 396 | or $esc_char_1 eq SUBSCRIPTS | |||
| 397 | or $esc_char_1 eq SUPERSCRIPTS | |||
| 398 | or $esc_char_1 eq ASCII_DEFAULT) | |||
| 399 | { | |||
| 400 | $G0 = $esc_char_1; | |||
| 401 | return $left+2; | |||
| 402 | } | |||
| 403 | ||||
| 404 | ## the second more complicated method of escaping to bigger charsets | |||
| 405 | return $left if $left+2 >= $right; | |||
| 406 | ||||
| 407 | my $esc_char_2 = substr($$str_ref, $left+2, 1); | |||
| 408 | my $esc_chars = $esc_char_1 . $esc_char_2; | |||
| 409 | ||||
| 410 | if ($esc_char_1 eq SINGLE_G0_A | |||
| 411 | or $esc_char_1 eq SINGLE_G0_B) | |||
| 412 | { | |||
| 413 | $G0 = $esc_char_2; | |||
| 414 | return $left+3; | |||
| 415 | } | |||
| 416 | ||||
| 417 | elsif ($esc_char_1 eq SINGLE_G1_A | |||
| 418 | or $esc_char_1 eq SINGLE_G1_B) | |||
| 419 | { | |||
| 420 | $G1 = $esc_char_2; | |||
| 421 | return $left+3; | |||
| 422 | } | |||
| 423 | ||||
| 424 | elsif ( $esc_char_1 eq MULTI_G0_A ) { | |||
| 425 | $G0 = $esc_char_2; | |||
| 426 | return $left+3; | |||
| 427 | } | |||
| 428 | ||||
| 429 | elsif ($esc_chars eq MULTI_G0_B | |||
| 430 | and ($left+3 < $right)) | |||
| 431 | { | |||
| 432 | $G0 = substr($$str_ref, $left+3, 1); | |||
| 433 | return $left+4; | |||
| 434 | } | |||
| 435 | ||||
| 436 | elsif (($esc_chars eq MULTI_G1_A or $esc_chars eq MULTI_G1_B) | |||
| 437 | and ($left + 3 < $right)) | |||
| 438 | { | |||
| 439 | $G1 = substr($$str_ref, $left+3, 1); | |||
| 440 | return $left+4; | |||
| 441 | } | |||
| 442 | ||||
| 443 | # we should never get here | |||
| 444 | warn("seem to have fallen through in _process_escape()"); | |||
| 445 | return $left; | |||
| 446 | } | |||
| 447 | ||||
| 448 | sub reset_charsets | |||
| 449 | { | |||
| 450 | $G0 = $DEFAULT_G0; | |||
| 451 | $G1 = $DEFAULT_G1; | |||
| 452 | } | |||
| 453 | ||||
| 454 | 1 | 12µs | 12µs | 1; |