Commit 8137f41e by 前钰

Upload New File

parent 78903dab
{
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "LiEWHmQLPDTz"
},
"outputs": [],
"source": [
"import nltk\n"
]
},
{
"cell_type": "code",
"source": [
"from nltk.corpus import stopwords\n",
"stopwords.words('english')"
],
"metadata": {
"id": "6K_MmzG_PW63"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from nltk.corpus import gutenberg\n",
"# gutenberg.fileids()\n",
"gutenberg.raw('austen-emma.txt')"
],
"metadata": {
"id": "sDUZYH75Pmvw"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from nltk.corpus import sentence_polarity\n",
"sentence_polarity.sents()"
],
"metadata": {
"id": "D0-wTA_9RWDj"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"[(sentence, category) for category in sentence_polarity.categories() for sentence in sentence_polarity.sents(categories=category)]"
],
"metadata": {
"id": "mt1QgVClSEjA"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from nltk.corpus import wordnet\n",
"sysns = wordnet.synsets('dog')\n",
"sysns"
],
"metadata": {
"id": "IjSHEZScSQCF"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# @title\n",
"sysns[0].definition()"
],
"metadata": {
"id": "GgvX-BgHS81L"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"sysns[0].examples()"
],
"metadata": {
"id": "AT_3oaaHVLC1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"sysns[0].lemmas()"
],
"metadata": {
"id": "kmk7qIobVMal"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"sysns[0].hypernyms()"
],
"metadata": {
"id": "oETa7kbkVOa_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"dog = wordnet.synset('dog.n.01')\n",
"cat = wordnet.synset('cat.n.01')\n",
"dog.path_similarity(cat)"
],
"metadata": {
"id": "pIfiuH0oVQxC"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"dog.wup_similarity(cat)"
],
"metadata": {
"id": "SemuS9s9Vduy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from nltk.corpus import sentiwordnet as swn\n",
"good = swn.senti_synsets('good', 'n')\n",
"\n",
"posscore=0\n",
"negscore=0\n",
"for synst in good:\n",
"\n",
" posscore=posscore+synst.pos_score()\n",
" negscore=negscore+synst.neg_score()\n",
"\n",
"\n",
"print(posscore)\n",
"print(negscore)"
],
"metadata": {
"id": "dngeFrS0VfvW"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from nltk.tokenize import sent_tokenize\n",
"text = gutenberg.raw('austen-emma.txt')\n",
"sents = sent_tokenize(text)\n",
"sents[100]"
],
"metadata": {
"id": "kfJJGbH-Vqrg"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from nltk.tokenize import word_tokenize\n",
"words = word_tokenize(sents[100])\n",
"words"
],
"metadata": {
"id": "T36no6nTYO1P"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from nltk import pos_tag\n",
"pos_tag(words)"
],
"metadata": {
"id": "G5WXfMZVYZq5"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# NER & # Chunking for nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.tag import pos_tag\n",
"\n",
"ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'\n",
"\n",
"def preprocess(sent):\n",
" sent = nltk.word_tokenize(sent)\n",
" sent = nltk.pos_tag(sent)\n",
" return sent\n",
"\n",
"sent = preprocess(ex)\n",
"sent\n",
"\n",
"pattern = 'NP: {<DT>?<JJ>*<NN>}'\n",
"\n",
"cp = nltk.RegexpParser(pattern)\n",
"cs = cp.parse(sent)\n",
"print(cs)\n",
"\n",
"from nltk.chunk import conlltags2tree, tree2conlltags\n",
"from pprint import pprint\n",
"iob_tagged = tree2conlltags(cs)\n",
"pprint(iob_tagged)\n",
"\n",
"ne_tree = ne_chunk(pos_tag(word_tokenize(ex)))\n",
"print(ne_tree)"
],
"metadata": {
"id": "3KLJcItyYjJL"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!pip install ltp"
],
"metadata": {
"id": "LhcmXD4hY24z"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from ltp import LTP\n",
"ltp = LTP() # 默认加载 LTP/Small 模型"
],
"metadata": {
"id": "75fnbO_dhX9b"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from ltp import StnSplit\n",
"sents = StnSplit().split(\"汤姆生病了。他去了医院。\")"
],
"metadata": {
"id": "syd9VHQtZ5ai"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"sents = StnSplit().batch_split([\"老王怎么今天没来上班?\", \"老王得了甲流,他今天去医院了。\"])\n",
"sents"
],
"metadata": {
"id": "UPfigeDOfKlD"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"words = ltp.pipeline([\"老王怎么今天没来上班?\"], tasks = [\"cws\"], return_dict = False)\n",
"words"
],
"metadata": {
"id": "jdY6gLZLfYgt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"result = ltp.pipeline([\"老王怎么今天没来上班?\"], tasks = [\"cws\",\"pos\",\"ner\",\"srl\",\"dep\",\"sdp\",\"sdpg\"])\n",
"print(result.cws)\n",
"print(result.pos)\n",
"print(result.ner)\n",
"print(result.srl)\n",
"print(result.dep)\n",
"print(result.sdp)\n",
"print(result.sdpg)"
],
"metadata": {
"id": "_ysoXWNkfsxt"
},
"execution_count": null,
"outputs": []
}
]
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment