diff --git a/data/citations.csv b/data/citations.csv new file mode 100644 index 0000000..b307eba --- /dev/null +++ b/data/citations.csv @@ -0,0 +1,13 @@ +Authors,Title,Publication,Volume,Number,Pages,Year,Publisher +"Gaizauskas, Robert; Wilks, Yorick; ",Information extraction: Beyond document retrieval,Journal of documentation,54,1,70-105,1998,MCB UP Ltd +"Morgan, Paul; ",Hypertext and the literary document,Journal of documentation,47,4,373-388,1991,MCB UP Ltd +"Kircz, Joost G; ",Modularity: the next form of scientific information presentation?,Journal of documentation,54,2,210-235,1998,MCB UP Ltd +"Farrow, John F; ",A cognitive process model of document indexing,Journal of documentation,47,2,149-166,1991,MCB UP Ltd +"Heery, Rachel; ",Review of metadata formats,Program,30,4,345-373,1996,MCB UP Ltd +"Burnett, Kathleen; Ng, Kwong Bor; Park, Soyeon; ",A comparison of the two traditions of metadata development,Journal of the American Society for Information Science,50,13,1209-1217,1999,Wiley Online Library +"Weibel, Stuart; Godby, Jean; Miller, Eric; Daniel, Ron; ",OCLC/NCSA metadata workshop report,,,,,1995, +"Schenkman, Bo N; Jönsson, Fredrik U; ",Aesthetics and preferences of web pages,Behaviour & information technology,19,5,367-377,2000,Taylor & Francis +"Lakoff, George; ",Explaining embodied cognition results,Topics in cognitive science,4,4,773-785,2012,Wiley Online Library +"McManus, I Chris; Wu, Wen; ","“The square is… bulky, heavy, contented, plain, good-natured, stupid…”: A cross-cultural study of the aesthetics and meanings of rectangles.","Psychology of Aesthetics, Creativity, and the Arts",7,2,130,2013,Educational Publishing Foundation +"Lakoff, George; Johnson, Mark; ",The metaphorical structure of the human conceptual system,Cognitive science,4,2,195-208,1980,No longer published by Elsevier +"Furuta, Richard; Stotts, P David; ",Specifying structured document transformations,Document Manipulation and typography,,,109-120,1988,Cambridge University Press Cambridge diff --git a/notebooks/2024-11-12.ipynb b/notebooks/2024-11-12.ipynb new file mode 100644 index 0000000..429e2b2 --- /dev/null +++ b/notebooks/2024-11-12.ipynb @@ -0,0 +1,680 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "e05c9332-dd32-4c12-b171-863fdc6f4da0", + "metadata": {}, + "outputs": [], + "source": [ + "import pymarc\n", + "import os\n", + "\n", + "from openai import OpenAI\n", + "\n", + "open_ai = OpenAI(api_key=os.environ.get(\"OPENAI_API_KEY\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "43ec74df-823c-4e45-986d-c145686bfc3e", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"ocn999379620.mrc\", \"rb\") as fo:\n", + " marc_records = [rec for rec in pymarc.MARCReader(fo)]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b2189b2f-7f1e-484e-948b-8a2dc10a8ce8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(marc_records)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "30b8fd7b-93e5-410a-8203-30caf3c05e5c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=LDR 01196cam a2200361Mi 4500\n", + "=001 ocn999379620\n", + "=003 OCoLC\n", + "=005 20230925021221.0\n", + "=008 160725s2016\\\\\\\\enka\\\\\\\\\\b\\\\\\\\000\\0\\eng\\d\n", + "=040 \\\\$aUKTTE$beng$cUKTTE$dOCLCQ\n", + "=019 \\\\$a994235819\n", + "=020 \\\\$a9780993475542\n", + "=020 \\\\$a099347554X\n", + "=035 \\\\$a(OCoLC)999379620$z(OCoLC)994235819\n", + "=080 \\\\$aLI\n", + "=049 \\\\$aSTFA\n", + "=245 00$aLi Huasheng :$bthe meditation room.\n", + "=246 30$aMeditation room\n", + "=260 \\\\$aLondon :$bMayor Gallery,$c2016.\n", + "=300 \\\\$a54 pages :$billustrations (some color) ;$c21 x 26 cm +$e1 loose sheet\n", + "=336 \\\\$atext$btxt$2rdacontent\n", + "=337 \\\\$aunmediated$bn$2rdamedia\n", + "=338 \\\\$avolume$bnc$2rdacarrier\n", + "=500 \\\\$aBiog. ; exhibs. ; colls.\n", + "=500 \\\\$aIntroduction by Nataline Colonnello.\n", + "=500 \\\\$aIncludes price list.\n", + "=500 \\\\$aPublished in an edition of 500 copies.\n", + "=504 \\\\$aIncludes bibliographical references.\n", + "=518 \\\\$aPublished on the occasion of the exhibition held at the Mayor Gallery, London, 2 June - 28 July 2016.\n", + "=600 14$aLi, Huasheng,$d1944-\n", + "=700 1\\$aColonnello, Nataline.\n", + "=710 2\\$aMayor Gallery (London)\n", + "=994 \\\\$aC0$bSTF\n", + "\n" + ] + } + ], + "source": [ + "print(marc_records[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7e88544d-dda8-4e30-9575-89334ee1d40a", + "metadata": {}, + "outputs": [], + "source": [ + "from openai import OpenAI" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "71db24fc-22c9-44ab-83e7-63f663f92e2d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "28f64cb5-9f9a-4d40-9be6-8439617f99c9", + "metadata": {}, + "outputs": [], + "source": [ + "chat_completion = open_ai.chat.completions.create(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"How would I extract a 245 MARC field from a record using pymarc?\",\n", + " }\n", + " ],\n", + " model=\"gpt-4o\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "589f678f-8953-4aaf-babb-4e1c1bb78386", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ChatCompletion(id='chatcmpl-ASnrgP1lBtQo96wR0HzZBYa465bxP', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='To extract a 245 MARC field from a record using the `pymarc` library in Python, you first need to parse the MARC record and then access that specific field. Here\\'s a step-by-step guide on how to do it:\\n\\n1. **Install `pymarc`:** If you haven\\'t already installed pymarc, you can do so using pip:\\n ```bash\\n pip install pymarc\\n ```\\n\\n2. **Read the MARC Record:** You’ll typically use a `MARCReader` or `parse_xml_to_array` if you are working with MARC21 or MARCXML files respectively.\\n\\n3. **Extract the 245 Field:** Once you have the MARC record, you can access the 245 field.\\n\\nHere\\'s a sample code demonstrating this process:\\n\\n```python\\nfrom pymarc import MARCReader\\n\\n# Open the MARC file in read-binary mode\\nwith open(\\'yourfile.mrc\\', \\'rb\\') as file:\\n reader = MARCReader(file)\\n\\n # Iterate through each record\\n for record in reader:\\n # Access the 245 field\\n field_245 = record[\\'245\\']\\n \\n # Check if the field exists\\n if field_245 is not None:\\n # Extract specific subfields (e.g., \\'a\\' for the title)\\n title = field_245[\\'a\\']\\n subtitle = field_245[\\'b\\'] if \\'b\\' in field_245 else \\'\\'\\n\\n print(f\"Title: {title}\")\\n if subtitle:\\n print(f\"Subtitle: {subtitle}\")\\n else:\\n print(\"245 field not found in this record.\")\\n```\\n\\n### Explanation:\\n\\n- **MARCReader:** Used to read MARC21 binary files.\\n- **record[\\'245\\']:** Accesses the 245 field if it exists.\\n- **field_245[\\'a\\'] and field_245[\\'b\\']:** Access subfields \\'a\\' (main title) and \\'b\\' (remainder of title), checking if they exist.\\n\\nMake sure to replace `\\'yourfile.mrc\\'` with the actual path to your MARC file.\\n\\nThis code snippet will extract and print the main title and subtitle (if available) of each record in the MARC file. Adjust the subfields you need based on your specific requirements.', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1731428228, model='gpt-4o-2024-08-06', object='chat.completion', service_tier=None, system_fingerprint='fp_159d8341cc', usage=CompletionUsage(completion_tokens=472, prompt_tokens=24, total_tokens=496, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chat_completion" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "ec7aea76-f6a1-45af-9741-17fbed0a6432", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "To extract a 245 MARC field from a record using the `pymarc` library in Python, you first need to parse the MARC record and then access that specific field. Here's a step-by-step guide on how to do it:\n", + "\n", + "1. **Install `pymarc`:** If you haven't already installed pymarc, you can do so using pip:\n", + " ```bash\n", + " pip install pymarc\n", + " ```\n", + "\n", + "2. **Read the MARC Record:** You’ll typically use a `MARCReader` or `parse_xml_to_array` if you are working with MARC21 or MARCXML files respectively.\n", + "\n", + "3. **Extract the 245 Field:** Once you have the MARC record, you can access the 245 field.\n", + "\n", + "Here's a sample code demonstrating this process:\n", + "\n", + "```python\n", + "from pymarc import MARCReader\n", + "\n", + "# Open the MARC file in read-binary mode\n", + "with open('yourfile.mrc', 'rb') as file:\n", + " reader = MARCReader(file)\n", + "\n", + " # Iterate through each record\n", + " for record in reader:\n", + " # Access the 245 field\n", + " field_245 = record['245']\n", + " \n", + " # Check if the field exists\n", + " if field_245 is not None:\n", + " # Extract specific subfields (e.g., 'a' for the title)\n", + " title = field_245['a']\n", + " subtitle = field_245['b'] if 'b' in field_245 else ''\n", + "\n", + " print(f\"Title: {title}\")\n", + " if subtitle:\n", + " print(f\"Subtitle: {subtitle}\")\n", + " else:\n", + " print(\"245 field not found in this record.\")\n", + "```\n", + "\n", + "### Explanation:\n", + "\n", + "- **MARCReader:** Used to read MARC21 binary files.\n", + "- **record['245']:** Accesses the 245 field if it exists.\n", + "- **field_245['a'] and field_245['b']:** Access subfields 'a' (main title) and 'b' (remainder of title), checking if they exist.\n", + "\n", + "Make sure to replace `'yourfile.mrc'` with the actual path to your MARC file.\n", + "\n", + "This code snippet will extract and print the main title and subtitle (if available) of each record in the MARC file. Adjust the subfields you need based on your specific requirements.\n" + ] + } + ], + "source": [ + "print(chat_completion.choices[0].message.content)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "43f4bdf9-44d8-4a71-bedf-c0d8855987eb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Li Huasheng :'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "marc_records[0][\"245\"][\"a\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "b4c6cfa8-2cea-4921-9f4b-2965d92fd4cf", + "metadata": {}, + "outputs": [], + "source": [ + "create_marc_record = open_ai.chat.completions.create(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Create a MARC 21 record with a title of Moby Dick by Herman Meville\",\n", + " }\n", + " ],\n", + " model=\"gpt-4o\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "7ca1b693-b0f3-4205-9d49-4877b2880428", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating a MARC 21 record involves using a standardized format for the description and cataloging of items in a library catalog. Below is an example of a simplified MARC 21 bibliographic record for *Moby Dick* by Herman Melville.\n", + "\n", + "```\n", + "=LDR 00000cam a2200000 a 4500\n", + "=001 000000001\n", + "=005 20231010123000.0\n", + "=008 231010s1851 mau 000 1 eng \n", + "=020 \\\\$a1234567890\n", + "=040 \\\\$aYourLibraryCode$beng$cYourLibraryCode\n", + "=041 0\\$aeng\n", + "=100 1\\$aMelville, Herman,$d1819-1891.\n", + "=245 10$aMoby Dick /$cby Herman Melville.\n", + "=260 \\\\$aBoston :$bHarper & Brothers,$c1851.\n", + "=300 \\\\$axx, 635 p. ;$c21 cm.\n", + "=336 \\\\$atext$btxt$2rdacontent\n", + "=337 \\\\$aunmediated$bn$2rdamedia\n", + "=338 \\\\$avolume$bnc$2rdacarrier\n", + "=500 \\\\$aOriginally published in 1851.\n", + "=520 \\\\$aAn allegorical novel about the voyage of the whaling ship Pequod and its captain, Ahab, who is intent on pursuing the giant white whale Moby Dick.\n", + "=650 \\0$aWhaling ships$vFiction.\n", + "=650 \\0$aSea stories.\n", + "=650 \\0$aWhales$vFiction.\n", + "=700 1\\$aMelville, Herman,$d1819-1891.\n", + "=852 \\\\$aYour Library Location\n", + "```\n", + "\n", + "### Explanation of some fields:\n", + "- **LDR (Leader):** Holds information for processing the record.\n", + "- **001:** Control number assigned by the institution.\n", + "- **005:** Date and time of the latest transaction.\n", + "- **008:** Fixed-length data elements, include date of publication, language, etc.\n", + "- **020:** ISBN (if applicable, otherwise can be omitted or replaced with an alternate identifier).\n", + "- **040:** Cataloging source, consists of library code and language of cataloging.\n", + "- **041:** Language code.\n", + "- **100:** Main entry - Personal name (author).\n", + "- **245:** Title statement.\n", + "- **260:** Publication information.\n", + "- **300:** Physical description of the item.\n", + "- **336, 337, 338:** RDA content, media, and carrier types.\n", + "- **500:** General note, here noting the original publication year.\n", + "- **520:** Summary or abstract of the work.\n", + "- **650:** Subject added entry - Topical term.\n", + "- **700:** Added entry for author (used if there is a collaboration, editors, etc., not necessary here, repeated in error).\n", + "- **852:** Location information in the library.\n", + "\n", + "This example is a basic outline and may need adjustments based on library-specific practices or cataloging rules.\n" + ] + } + ], + "source": [ + "print(create_marc_record.choices[0].message.content)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "dd05dd87-2b46-4a43-8162-942b1661a361", + "metadata": {}, + "outputs": [], + "source": [ + "csv_string = \"Title,Author,Copyright\\nPride and Prejudice,Jane Austen,1818\\nMoby Dick,Herman Meville,1851\\nAdventure of Tom Sawyer,Mark Twain,1876\"" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "93467eb8-4d35-4b20-a9cd-f54b9eea7f7a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Title,Author,Copyright\n", + "Pride and Prejudice,Jane Austen,1818\n", + "Moby Dick,Herman Meville,1851\n", + "Adventure of Tom Sawyer,Mark Twain,1876\n" + ] + } + ], + "source": [ + "print(csv_string)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "300b00bb-3d83-48ad-a40a-aca22d8e7330", + "metadata": {}, + "outputs": [], + "source": [ + "create_marc_records = open_ai.chat.completions.create(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": f\"Create a MARC 21 record for each in this CSV file:\\n{csv_string}\",\n", + " }\n", + " ],\n", + " model=\"gpt-4o\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "ccd18a3d-b15a-45a4-b007-3f0ad8563571", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating MARC 21 records involves formatting bibliographic data into fields and subfields according to the MARC standard. Below, you'll find MARC 21 records for each of the entries provided in the CSV file.\n", + "\n", + "### Record for \"Pride and Prejudice\" by Jane Austen\n", + "\n", + "```\n", + "=LDR 00000nam a2200000 a 4500\n", + "=245 10$aPride and Prejudice /$cJane Austen.\n", + "=100 1\\$aAusten, Jane.\n", + "=260 \\\\$c1818.\n", + "=300 \\\\$apages cm.\n", + "```\n", + "\n", + "### Record for \"Moby Dick\" by Herman Melville\n", + "\n", + "```\n", + "=LDR 00000nam a2200000 a 4500\n", + "=245 10$aMoby Dick /$cHerman Melville.\n", + "=100 1\\$aMelville, Herman.\n", + "=260 \\\\$c1851.\n", + "=300 \\\\$apages cm.\n", + "```\n", + "\n", + "### Record for \"Adventure of Tom Sawyer\" by Mark Twain\n", + "\n", + "```\n", + "=LDR 00000nam a2200000 a 4500\n", + "=245 10$aAdventure of Tom Sawyer /$cMark Twain.\n", + "=100 1\\$aTwain, Mark.\n", + "=260 \\\\$c1876.\n", + "=300 \\\\$apages cm.\n", + "```\n", + "\n", + "### Explanation:\n", + "\n", + "- `LDR` (Leader): This is a mandatory field that provides information for the processing of the record. The values are placeholders and should be replaced with appropriate values by library systems.\n", + "- `245` (Title Statement): Contains the title and the statement of responsibility (author).\n", + " - `$a`: Title.\n", + " - `$c`: Statement of responsibility (author).\n", + "- `100` (Main Entry—Personal Name): Provides the name of the author. \n", + " - The last number `1\\` indicates main entry.\n", + " - `\\$a`: Personal name.\n", + "- `260` (Publication, Distribution, etc.):\n", + " - `\\$c`: Date of publication.\n", + "- `300` (Physical Description): This field is typically used to describe the extent and dimensions. Here, \"pages cm\" is a placeholder for actual pagination which should be filled per the specific edition's details.\n", + "\n", + "Note: In a real library system, each of these records might include additional fields such as publisher information, ISBNs (if applicable for newer editions), subjects, and other metadata necessary for library cataloging.\n" + ] + } + ], + "source": [ + "print(create_marc_records.choices[0].message.content)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "1411ec5f-8be2-4d29-999b-ab5161f6d346", + "metadata": {}, + "outputs": [], + "source": [ + "create_marc_records = open_ai.chat.completions.create(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": f\"Create a MARC 21 record with ordered fields for each row in this CSV file:\\n{csv_string}\",\n", + " }\n", + " ],\n", + " model=\"gpt-4o\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "cf39ce51-93d4-4aa4-8e58-bab33f41e930", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Below are the MARC 21 records for each of the entries from the CSV file. In a MARC 21 record, the fields are typically ordered using tags such as 100 for the author, 245 for the title, and 260 for publication details (in this case, using 'c' for copyright year).\n", + "\n", + "Record 1:\n", + "```\n", + "=100 1\\$aAusten, Jane.\n", + "=245 10$aPride and Prejudice /$cJane Austen.\n", + "=260 \\\\\\$cc1818.\n", + "```\n", + "\n", + "Record 2:\n", + "```\n", + "=100 1\\$aMelville, Herman.\n", + "=245 10$aMoby Dick /$cHerman Melville.\n", + "=260 \\\\\\$cc1851.\n", + "```\n", + "\n", + "Record 3:\n", + "```\n", + "=100 1\\$aTwain, Mark.\n", + "=245 10$aAdventure of Tom Sawyer /$cMark Twain.\n", + "=260 \\\\\\$cc1876.\n", + "```\n", + "\n", + "Explanation of MARC fields:\n", + "- `100`: Main Entry—Personal Name, showing the primary responsibility (author) for the work.\n", + "- `245`: Title Statement, where the title and statement of responsibility (author's name) are provided.\n", + "- `260`: Publication, Distribution, etc., providing information related to the publication, including the copyright date (represented with subfield `$c`).\n" + ] + } + ], + "source": [ + "print(create_marc_records.choices[0].message.content)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "40de384c-502d-44a4-8f76-00d96c3b8110", + "metadata": {}, + "outputs": [], + "source": [ + "spanish_marc_record = open_ai.chat.completions.create(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": f\"Create a MARC 21 record for book Cien años de soledad\",\n", + " }\n", + " ],\n", + " model=\"gpt-4o\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "5faa221d-60af-440a-97ba-feb871223eac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating a MARC 21 record for a book involves capturing bibliographic information in a standardized format used by libraries. Below is a simplified example of a MARC 21 record for \"Cien años de soledad\" by Gabriel García Márquez. Note that real MARC records may include additional fields based on the library's cataloging standards and specific edition details.\n", + "\n", + "```plaintext\n", + "LDR 00000nam 2200000 a 4500\n", + "001 123456789\n", + "005 20231025120000.0\n", + "008 991001s1967 xx 000 1 spa d\n", + "010 ##$a 12345678\n", + "020 ##$a 9780307474728\n", + "040 ##$a DLC$b eng$c DLC\n", + "100 1#$a García Márquez, Gabriel,$d 1927-2014.\n", + "240 10$a Cien años de soledad.\n", + "245 10$a Cien años de soledad /$c Gabriel García Márquez.\n", + "250 $a Primera edición.\n", + "260 $a Buenos Aires :$b Editorial Sudamericana,$c 1967.\n", + "300 $a 471 p. ;$c 21 cm.\n", + "490 1#$a Primera edición ;$v 1\n", + "500 $a \"Premio Nobel de Literatura, 1982.\"\n", + "500 $a \"Cien años de soledad fue publicada en 1967.\"\n", + "520 $a La historia de la familia Buendía, situada en el pueblo ficticio de Macondo.\n", + "650 #0$a Magic realism (Literature)\n", + "650 #0$a Buendía, José Arcadio (Fictitious character)$v Fiction.\n", + "650 #0$a Macondo (Imaginary place)$v Fiction.\n", + "830 0$a Primera edición ;$v 1\n", + "856 42$3Contributor biographical information$u http://www.linktobiography.com\n", + "856 42$3Publisher description$u http://www.linktopublisherdescription.com\n", + "```\n", + "\n", + "### Field Breakdown:\n", + "- **LDR (Leader):** Indicates the type of material and bibliographic level.\n", + "- **001:** Control number (unique identifier for the record).\n", + "- **005:** Date and time of latest transaction.\n", + "- **008:** Fixed length data elements (publication date, language, etc.).\n", + "- **010:** Library of Congress Control Number (LCCN).\n", + "- **020:** ISBN.\n", + "- **040:** Cataloging source.\n", + "- **100:** Main entry (personal name – author).\n", + "- **240:** Uniform Title.\n", + "- **245:** Title statement (includes the title and statement of responsibility).\n", + "- **250:** Edition statement.\n", + "- **260:** Publication information.\n", + "- **300:** Physical description.\n", + "- **490/830:** Series statement and information.\n", + "- **500:** General notes.\n", + "- **520:** Summary note.\n", + "- **650:** Subject added entry – topical term.\n", + "- **856:** Electronic location and access, if applicable.\n", + "\n", + "Note: Specific details (such as URLs in field 856) and field contents may vary based on the specific library system and edition of the book being cataloged. Also, some fields may not be used for all types of records or may have different configurations in a live setting.\n" + ] + } + ], + "source": [ + "print(spanish_marc_record.choices[0].message.content)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "1ed1d804-ff89-4278-a0eb-ebcc4bf48f18", + "metadata": {}, + "outputs": [], + "source": [ + "marc_record = open_ai.chat.completions.create(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": f\"Create a MARC 21 record for book Two hundred años de soledad\",\n", + " }\n", + " ],\n", + " model=\"gpt-4o\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "a75ee54d-b7fc-479c-b994-3176fe520226", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Certainly! The MARC 21 format is a standard for the representation and communication of bibliographic and related information in machine-readable form. Below is a MARC 21 record example for a fictional book titled \"Two Hundred Años de Soledad\" (a play on Gabriel García Márquez's \"One Hundred Years of Solitude\", assuming it's a similar work in terms of bibliographic elements).\n", + "\n", + "```plaintext\n", + "=Leader 00000nam a2200000 i 4500\n", + "=001 123456789\n", + "=005 20231012120000.0\n", + "=008 231012s2023 xxu 000 1 spa d\n", + "=020 \\\\$a9781234567890\n", + "=040 \\\\$aXYZ$cXYZ$eIFN\n", + "=041 1 \\$aspa$heng\n", + "=100 1\\$aDoe, Juan.\n", + "=245 10$aTwo hundred años de soledad /$cJuan Doe.\n", + "=246 3\\$a200 años de soledad\n", + "=250 \\$aPrimera edición.\n", + "=260 \\$aCiudad Inexistente :$bEditorial Fantasía,$c2023.\n", + "=300 \\$a350 páginas ;$c23 cm.\n", + "=336 \\$atexto$btxt$2rdacontent\n", + "=337 \\$asin mediado$bn$2rdamedia\n", + "=338 \\$avolumen$bnc$2rdacarrier\n", + "=500 \\\\$aObra de ficción.\n", + "=504 \\\\$aIncluye referencias bibliográficas.\n", + "=520 \\\\$aEsta novela narra la historia de la familia ficticia y sus encuentros mágicos en el pueblo de Macondo a lo largo de doscientos años.\n", + "=650 \\0$aFicción mágica.\n", + "=650 \\0$aLiteratura hispanoamericana.\n", + "=700 1\\$aPérez, María,$etraductor.\n", + "=852 \\\\$aBiblioteca de Ejemplo\n", + "```\n", + "\n", + "**Notes:**\n", + "\n", + "- Fields `100`, `245`, `250`, `260`, etc., represent different parts of the bibliographic description. For example:\n", + " - `100` is the main author.\n", + " - `245` is the title statement.\n", + " - `260` is the publication information.\n", + " - `650` is the subject heading.\n", + "- The content in fields `a`, `b`, `c`, etc., represents subfields which contain specific pieces of information.\n", + "- This record is an example and assumes fictional details (e.g., author name \"Juan Doe\", publisher \"Editorial Fantasía\"). You should replace them with actual bibliographic information as needed.\n" + ] + } + ], + "source": [ + "print(marc_record.choices[0].message.content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f601e30d-c841-462a-810f-a670f339f034", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/2024-12-10.ipynb b/notebooks/2024-12-10.ipynb new file mode 100644 index 0000000..8260d20 --- /dev/null +++ b/notebooks/2024-12-10.ipynb @@ -0,0 +1,91 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c6890bb6-5062-4fc2-bb81-e3571c633ec6", + "metadata": {}, + "source": [ + "# Hackathon using Linked Data with quick statements in wikidata\n", + "\n", + "- Wikidata [Crade](https://www.wikidata.org/wiki/Wikidata:Cradle#article)\n", + "- [QuickStatements UI](https://quickstatements.toolforge.org/)\n", + "- QuickStatements [API](https://www.wikidata.org/wiki/Help:QuickStatements#Using_the_API_to_start_batches)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e8037a5-542c-4a11-9adb-e4b416e30f2d", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import httpx\n", + "import openai\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac60f124-7ff5-4716-a7f3-5f06d4e32173", + "metadata": {}, + "outputs": [], + "source": [ + "client = openai.OpenAI(\n", + " api_key=os.environ.get(\"OPENAI_API_KEY\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c9ac12c-373e-4675-ad78-918c8a4b6723", + "metadata": {}, + "outputs": [], + "source": [ + "citations = pd.read_csv(\"../data/citations.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5e6ad5e-5b88-4a69-98fc-1e10bdd0d662", + "metadata": {}, + "outputs": [], + "source": [ + "citations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59963d06-ebae-4edd-93a0-f93855def04b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}