Upload New File

8137f41e · 前钰 · 78903dab · 8137f41e
Commit 8137f41e authored Jan 12, 2024 by 前钰
Hide whitespace changes
Inline Side-by-side

Showing with 337 additions and 0 deletions

NPL2024_Lesson3.ipynb ...战第三期/实战代码/深度学习项目实战/NLP中的基础工具集和常用数据集/NPL2024_Lesson3.ipynb +337 -0

No files found.
--- a/人工智能系统实战第三期/实战代码/深度学习项目实战/NLP中的基础工具集和常用数据集/NPL2024_Lesson3.ipynb
+++ b/人工智能系统实战第三期/实战代码/深度学习项目实战/NLP中的基础工具集和常用数据集/NPL2024_Lesson3.ipynb
+{
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "LiEWHmQLPDTz"
+      },
+      "outputs": [],
+      "source": [
+        "import nltk\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from nltk.corpus import stopwords\n",
+        "stopwords.words('english')"
+      ],
+      "metadata": {
+        "id": "6K_MmzG_PW63"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from nltk.corpus import gutenberg\n",
+        "# gutenberg.fileids()\n",
+        "gutenberg.raw('austen-emma.txt')"
+      ],
+      "metadata": {
+        "id": "sDUZYH75Pmvw"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from nltk.corpus import sentence_polarity\n",
+        "sentence_polarity.sents()"
+      ],
+      "metadata": {
+        "id": "D0-wTA_9RWDj"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "[(sentence, category) for category in sentence_polarity.categories() for sentence in sentence_polarity.sents(categories=category)]"
+      ],
+      "metadata": {
+        "id": "mt1QgVClSEjA"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from nltk.corpus import wordnet\n",
+        "sysns = wordnet.synsets('dog')\n",
+        "sysns"
+      ],
+      "metadata": {
+        "id": "IjSHEZScSQCF"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# @title\n",
+        "sysns[0].definition()"
+      ],
+      "metadata": {
+        "id": "GgvX-BgHS81L"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "sysns[0].examples()"
+      ],
+      "metadata": {
+        "id": "AT_3oaaHVLC1"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "sysns[0].lemmas()"
+      ],
+      "metadata": {
+        "id": "kmk7qIobVMal"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "sysns[0].hypernyms()"
+      ],
+      "metadata": {
+        "id": "oETa7kbkVOa_"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "dog = wordnet.synset('dog.n.01')\n",
+        "cat = wordnet.synset('cat.n.01')\n",
+        "dog.path_similarity(cat)"
+      ],
+      "metadata": {
+        "id": "pIfiuH0oVQxC"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "dog.wup_similarity(cat)"
+      ],
+      "metadata": {
+        "id": "SemuS9s9Vduy"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from nltk.corpus import sentiwordnet as swn\n",
+        "good = swn.senti_synsets('good', 'n')\n",
+        "\n",
+        "posscore=0\n",
+        "negscore=0\n",
+        "for synst in good:\n",
+        "\n",
+        "    posscore=posscore+synst.pos_score()\n",
+        "    negscore=negscore+synst.neg_score()\n",
+        "\n",
+        "\n",
+        "print(posscore)\n",
+        "print(negscore)"
+      ],
+      "metadata": {
+        "id": "dngeFrS0VfvW"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from nltk.tokenize import sent_tokenize\n",
+        "text = gutenberg.raw('austen-emma.txt')\n",
+        "sents = sent_tokenize(text)\n",
+        "sents[100]"
+      ],
+      "metadata": {
+        "id": "kfJJGbH-Vqrg"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from nltk.tokenize import word_tokenize\n",
+        "words = word_tokenize(sents[100])\n",
+        "words"
+      ],
+      "metadata": {
+        "id": "T36no6nTYO1P"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from nltk import pos_tag\n",
+        "pos_tag(words)"
+      ],
+      "metadata": {
+        "id": "G5WXfMZVYZq5"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# NER & # Chunking for nltk\n",
+        "from nltk.tokenize import word_tokenize\n",
+        "from nltk.tag import pos_tag\n",
+        "\n",
+        "ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'\n",
+        "\n",
+        "def preprocess(sent):\n",
+        "    sent = nltk.word_tokenize(sent)\n",
+        "    sent = nltk.pos_tag(sent)\n",
+        "    return sent\n",
+        "\n",
+        "sent = preprocess(ex)\n",
+        "sent\n",
+        "\n",
+        "pattern = 'NP: {<DT>?<JJ>*<NN>}'\n",
+        "\n",
+        "cp = nltk.RegexpParser(pattern)\n",
+        "cs = cp.parse(sent)\n",
+        "print(cs)\n",
+        "\n",
+        "from nltk.chunk import conlltags2tree, tree2conlltags\n",
+        "from pprint import pprint\n",
+        "iob_tagged = tree2conlltags(cs)\n",
+        "pprint(iob_tagged)\n",
+        "\n",
+        "ne_tree = ne_chunk(pos_tag(word_tokenize(ex)))\n",
+        "print(ne_tree)"
+      ],
+      "metadata": {
+        "id": "3KLJcItyYjJL"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install ltp"
+      ],
+      "metadata": {
+        "id": "LhcmXD4hY24z"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from ltp import LTP\n",
+        "ltp = LTP() # 默认加载 LTP/Small 模型"
+      ],
+      "metadata": {
+        "id": "75fnbO_dhX9b"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from ltp import StnSplit\n",
+        "sents = StnSplit().split(\"汤姆生病了。他去了医院。\")"
+      ],
+      "metadata": {
+        "id": "syd9VHQtZ5ai"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "sents = StnSplit().batch_split([\"老王怎么今天没来上班？\", \"老王得了甲流，他今天去医院了。\"])\n",
+        "sents"
+      ],
+      "metadata": {
+        "id": "UPfigeDOfKlD"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "words = ltp.pipeline([\"老王怎么今天没来上班？\"], tasks = [\"cws\"], return_dict = False)\n",
+        "words"
+      ],
+      "metadata": {
+        "id": "jdY6gLZLfYgt"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "result = ltp.pipeline([\"老王怎么今天没来上班？\"], tasks = [\"cws\",\"pos\",\"ner\",\"srl\",\"dep\",\"sdp\",\"sdpg\"])\n",
+        "print(result.cws)\n",
+        "print(result.pos)\n",
+        "print(result.ner)\n",
+        "print(result.srl)\n",
+        "print(result.dep)\n",
+        "print(result.sdp)\n",
+        "print(result.sdpg)"
+      ],
+      "metadata": {
+        "id": "_ysoXWNkfsxt"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file