import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
try:
from mrrc import Record, Field, Leader
except ImportError:
print("Error: mrrc not installed")
print("Install with: pip install mrrc")
sys.exit(1)
def explain_character_encoding():
    """Print a short primer comparing the MARC-8 and UTF-8 encodings.

    Writes the section-1 banner, an introductory sentence, and one bulleted
    profile per encoding to stdout. Returns nothing.
    """
    rule = "=" * 70

    # One (heading, bullet points) pair per encoding, in display order.
    profiles = (
        (
            "MARC-8 (Legacy):",
            (
                "Indicator: Space character (' ') in Leader position 9",
                "Uses ISO 2022 escape sequences",
                "Supports: Latin, Greek, Cyrillic, Arabic, Hebrew, CJK",
                "More compact but complex (escape sequences)",
                "Example escape: ESC ) 2 = Switch to Hebrew",
                "Common in: older catalog records, legacy systems",
            ),
        ),
        (
            "UTF-8 (Modern):",
            (
                "Indicator: 'a' in Leader position 9",
                "Direct Unicode representation",
                "Supports: All Unicode characters",
                "Simpler, more flexible",
                "Natively supported by modern systems",
                "Common in: modern systems, web services",
            ),
        ),
    )

    print("\n" + rule)
    print("1. CHARACTER ENCODING BASICS")
    print(rule + "\n")
    print("MARC records support two character encodings:\n")
    for heading, bullets in profiles:
        print(heading)
        for bullet in bullets:
            print(f" - {bullet}")
        print()
def detect_encoding_from_leader():
    """Demonstrate reading the character coding scheme from a Leader.

    For each coding value (blank and ``'a'``) a ``Leader`` is constructed and
    its ``character_coding`` attribute is printed together with the encoding
    name and a one-line handling hint. Output goes to stdout; nothing is
    returned.
    """
    divider = "=" * 70
    print("\n" + divider)
    print("2. DETECTING ENCODING FROM LEADER")
    print(divider + "\n")

    # (heading, Leader character_coding value, encoding label, handling hint)
    scenarios = (
        (
            "Example 1: MARC-8 Encoded Record",
            ' ',
            "MARC-8 (escape sequences)",
            "Use MARC-8 decoder to process content",
        ),
        (
            "Example 2: UTF-8 Encoded Record",
            'a',
            "UTF-8",
            "Direct Unicode processing, no decoding needed",
        ),
    )

    for heading, coding, label, hint in scenarios:
        print(heading)
        leader = Leader(
            record_type='a',
            bibliographic_level='m',
            character_coding=coding,
        )
        # Echo the value back from the Leader object rather than the input,
        # so the demo shows what the library actually stores.
        print(f" Leader position 9: '{leader.character_coding}'")
        print(f" Encoding: {label}")
        print(f" How to handle: {hint}")
        print()
def _build_field(tag, ind1, ind2, subfields):
    """Create a ``Field`` and append each ``(code, value)`` pair in order."""
    field = Field(tag, ind1, ind2)
    for code, value in subfields:
        field.add_subfield(code, value)
    return field


def create_utf8_encoded_record():
    """Build and display a UTF-8 record that carries a Hebrew-script variant.

    The Leader is created with ``character_coding='a'``. The record receives
    control fields 001 and 008, a 245 title, a 100 author, two 650 subjects,
    a 546 language note, and one 880 field holding the title in Hebrew
    script. The title, author, language note and each 880 variant are then
    printed to stdout. Returns nothing.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("3. UTF-8 ENCODED MULTILINGUAL RECORD")
    print(rule + "\n")

    leader = Leader(
        record_type='a',
        bibliographic_level='m',
        character_coding='a',
    )
    record = Record(leader)
    record.add_control_field('001', 'ocm987654321')
    # NOTE(review): MARC 21 defines 008 as 40 character positions; this
    # literal is shorter. Confirm the intended padding before reusing it.
    record.add_control_field('008', '210115s2021 xx||||||||||||||||eng||')

    # (tag, indicator 1, indicator 2, ordered (subfield code, value) pairs)
    data_fields = (
        ('245', '1', '0', (
            ('a', 'Learning world languages /'),
            ('c', 'Multiple authors.'),
        )),
        ('100', '1', ' ', (
            ('a', 'Smith, John,'),
            ('e', 'author.'),
        )),
        ('650', ' ', '0', (
            ('a', 'Hebrew language'),
            ('x', 'Study and teaching.'),
        )),
        ('650', ' ', '0', (
            ('a', 'Arabic language'),
            ('x', 'Grammar.'),
        )),
        ('546', ' ', ' ', (
            ('a', 'Text in English; includes Hebrew, Arabic, and Cyrillic examples.'),
        )),
        # $6 (Linkage) is placed first: MARC 21 requires it to be the first
        # subfield of any field that carries it.
        # NOTE(review): a complete linkage would normally also put a
        # reciprocal $6 on the 245 -- confirm whether the demo should show it.
        ('880', '1', '0', (
            ('6', '245-01'),
            ('a', 'למידת שפות עולם /'),
        )),
    )
    for tag, ind1, ind2, subfields in data_fields:
        record.add_field(_build_field(tag, ind1, ind2, subfields))

    print("Record Encoding: UTF-8")
    print(f"Title: {record.title}")
    print(f"Author: {record.author}")
    print()

    print("Language Information:")
    if '546' in record:
        note = record['546'].get_subfield('a')
        if note:
            print(f" {note}")
    print()

    print("Variant Forms (880 fields):")
    for field in record.get_fields('880'):
        variant_a = field.get_subfield('a')
        link = field.get_subfield('6')
        if variant_a:
            print(f" Script variant: {variant_a}")
            print(f" Links to: field {link}")
    print()
def marc8_character_sets():
    """Print the MARC-8 character sets and the escape sequences selecting them.

    Writes the section-4 banner, one entry per character set (name, escape
    sequence, description), and a five-step outline of how MARC-8 decoding
    proceeds. Output goes to stdout; nothing is returned.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("4. MARC-8 CHARACTER SETS")
    print(rule + "\n")
    print("MARC-8 uses escape sequences to switch between character sets:\n")

    # The last character of each sequence is the set's "final character" in
    # the Library of Congress MARC-8 specification: B = Basic Latin (ASCII),
    # E = Extended Latin (ANSEL), S = Basic Greek, 3 = Basic Arabic,
    # 2 = Basic Hebrew, N = Basic Cyrillic, 1 = CJK (EACC).
    # "(" designates the set as G0, ")" as G1; "$" marks a multi-byte set.
    character_sets = [
        ("ASCII (G0)", "ESC ( B", "Basic Latin characters"),
        ("Extended Latin", "ESC ) E", "Accented Latin letters"),
        ("Greek", "ESC ( S", "Greek alphabet"),
        ("Arabic", "ESC ) 3", "Arabic script"),
        ("Hebrew", "ESC ) 2", "Hebrew script"),
        ("Cyrillic", "ESC ( N", "Cyrillic alphabet"),
        ("CJK", "ESC $ ) 1", "Chinese, Japanese, Korean"),
    ]
    for name, escape, description in character_sets:
        print(f" {name}")
        print(f" Escape sequence: {escape}")
        print(f" Content: {description}")
        print()

    print("How MARC-8 works:")
    print(" 1. Default: ASCII characters (0x20-0x7E)")
    print(" 2. Encounter non-ASCII: Check escape sequence prefix")
    print(" 3. Switch: Change character interpretation based on set")
    print(" 4. Decode: Apply character mapping for active set")
    print(" 5. Switch back: ESC ( B returns to ASCII")
    print()
def practical_encoding_decisions():
    """Print guidance on choosing between MARC-8 and UTF-8.

    Writes the section-5 banner, two check-marked lists (when to use each
    encoding) and two numbered lists (migration tips and multilingual record
    handling) to stdout. Returns nothing.
    """
    rule = "=" * 70

    # Sections rendered with a check mark in front of every entry.
    checklists = (
        (
            "USE MARC-8 WHEN:",
            (
                "Working with legacy library systems",
                "Preserving historical records",
                "Maintaining backward compatibility",
                "Working with Library of Congress records (historical)",
            ),
        ),
        (
            "USE UTF-8 WHEN:",
            (
                "Building new systems",
                "Web-based MARC delivery",
                "Modern library platforms",
                "Multilingual collections (simplifies handling)",
                "Mobile or API-first applications",
            ),
        ),
    )

    # Sections rendered as 1-based numbered steps.
    numbered_lists = (
        (
            "MIGRATION TIPS:",
            (
                "Detect encoding from Leader position 9",
                "Use appropriate decoder (MARC-8 decoder or UTF-8)",
                "Validate round-trip conversion",
                "Test with multilingual content",
                "Preserve variant forms (880 fields) during conversion",
            ),
        ),
        (
            "HANDLING MULTILINGUAL RECORDS:",
            (
                "Language in 546 field documents languages present",
                "880 field variants show same content in different scripts",
                "Script code in $6 subfield links variant to main field",
                "Language code in 041 field documents languages",
            ),
        ),
    )

    print("\n" + rule)
    print("5. PRACTICAL ENCODING DECISIONS")
    print(rule + "\n")

    for title, points in checklists:
        print(title)
        for point in points:
            print(f" ✓ {point}")
        print()

    for title, steps in numbered_lists:
        print(title)
        for number, step in enumerate(steps, start=1):
            print(f" {number}. {step}")
        print()
def main():
    """Run every demo section in order, then print the closing summary.

    Prints the title banner, calls the five section functions, and finishes
    with a summary block that includes reference URLs. Returns nothing.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("MRRC: MARC-8 Encoding Support (Python/pymarc-compatible)")
    print(banner)

    sections = (
        explain_character_encoding,
        detect_encoding_from_leader,
        create_utf8_encoded_record,
        marc8_character_sets,
        practical_encoding_decisions,
    )
    for run_section in sections:
        run_section()

    print(banner)
    print("SUMMARY")
    print(banner)

    # The empty first and last entries reproduce the newline that opens and
    # closes the summary text.
    summary_lines = (
        "",
        "MRRC automatically detects and handles both MARC-8 and UTF-8 encoding:",
        "1. MARC-8 records use escape sequences for non-ASCII characters",
        "2. UTF-8 records use direct Unicode representation",
        "3. Both are equally supported by mrrc",
        "4. Leader position 9 indicates which encoding is used",
        "5. Modern systems recommend UTF-8 for simplicity",
        "When working with MARC records:",
        "- Don't worry about encoding internally (mrrc handles it)",
        "- Ensure your system supports the encoding before import",
        "- For modern systems, prefer UTF-8",
        "- Test multilingual content thoroughly",
        "- Use 880 fields for script variants in MARC-8 records",
        "REFERENCE:",
        "- Library of Congress Character Encoding Guidelines:",
        "https://www.loc.gov/marc/specifications/spechome.html",
        "- MARC-8 to UTF-8 Conversion:",
        "https://www.loc.gov/standards/marcxml/",
        "",
    )
    print("\n".join(summary_lines))
    print()
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()