== 批量删除 == find . -name "*.pdf"|xargs rm -rf find . -name "*mobi" -print0|xargs -0 rm -rf == 删除空文件夹 == find ./aliyunpan/202305/ -type d -empty -delete == 批量加后缀 == find . -type f |xargs mv {} {}.txt find . -type f | xargs -I F mv "F" "F".txt == 查看有没有非txt的 == find . -type f|grep -v "txt" == 将所有非txt结尾的文件都移到一个地方 == find . -type f -print0 |grep -v "txt"|xargs -0 -I F mv "F" "/Users/liuhui/Documents/20230127/tmp/" == 修改后缀名TXT—>txt == find . -type f | grep json |xargs -I F echo "F" "F" |sed 's/json/jsonl/2' |xargs -I {} echo "mv" {} > tmp.sh for file_name in `ls *`; do mv ${file_name} AE${file_name}.json;done for file_name in `ls *`; do mv ${file_name} ${file_name}l;done == 批量删除空文件 == find . -name "*" -type f -size 0c | xargs rm -rf == 批量删除大于1MB的文件 == find . -name "*" -type f -size +1M |xargs rm -rf == 查找100k-200k的pdf == find . -name "*pdf" -type f -size +100k -size -200k == 批量解rar压缩 == for tar in *.rar; do unrar e -o+ $tar; done == 批量解zip压缩 == unzip '*.zip' == 批量删除某个文件夹 == nohup find . -name "zipout*" -type d|xargs rm -rf & == 转码 == for file_name in `ls *.xml`; do ebook-convert ${file_name} .txt --txt-output-formatting=plain;done find . -name "*.xml" | xargs -n 1 -I {} ebook-convert {} .txt --txt-output-formatting=plain find . -name "*.xml" | xargs -i ebook-convert {} .txt --txt-output-formatting=plain == xml转txt == find . -name "*.xml" |xargs -I F echo "F" "F" |sed 's/xml/txt/1' |xargs -I {} echo "html2text -from_encoding utf8 -o" {} html2text --no-wrap-link --ignore-emphasis --ignore-links --ignore-images --ignore-tables --decode-errors 'ignore' vbk2012.mdx.part20.txt utf8 >v20.txt == 将开头n个文件移走 == ls |head -n 10000| xargs -i mv {} ../20230115/ == 加密压缩文件夹 == zip -r -e 20230133.zip ./20230133/ == 压缩 == gzip -9 filename find .
-type f | grep jsonl |xargs -I F echo "F" ">F" |sed 's/jsonl/jsonl.gz/2' |xargs -I {} echo "gzip -9 -c" {} zip -v -9 -r demo.zip demo zip -r -9 -e ~/file/3_dir/raw/20230309.zip 20230309/ zip -rq -9 -e 20230301.zip 20230301/ unzip -P 253874 -q 20230301.zip -d ~/bt2/ zip -r ~/network/liuhui/error_log.zip ~/network/liuhui/error_log/ == git常用命令 == git add -A . 一次性添加所有改变的文件 git status git reset HEAD 后面什么都不跟,就是把上一次add的内容全部撤销 git reset HEAD XXX 后面跟文件名,就是对某个文件进行撤销 == 查看编码分布 == awk -F ',' '{count[$2]++;} END {for(i in count) {print " ",i,count[i]}}' check_result.csv == 爬取github == nohup python publicRepos.py --start 99000000 --end 100000000 --github_tokens_file token.txt &>tmp.log & nohup python publicRepos.py --start 199000000 --end 200000000 --github_tokens_file token.txt &>tmp.log & find . -name "*.jsonl" -type f | xargs ls -l for file_name in `ls *.html`; do ebook-convert ${file_name} .txt;done == 爬取github文件 == nohup python repo_list_filter_export.py ~/bt2/bak_file/chinese_202303/2023030 100000 &>tmp.txt & nohup /home/jovyan/file/github_tmp/gh_downloader/gh_jsonl2txt/gh_downloader-linux-amd64 &>tmp.txt & == 编码识别 == python ~/file/github_tmp/charset_mnbvc/convert_files.py -i /Downloads/20230109 -step 1 -r check_result.csv == 分割文件 == split -l 100000 -d -a 2 arxiv-metadata-oai-snapshot.json arxiv-metadata-oai-snapshot.json == 批量kill == ps -ef|grep arxiv|grep -v grep|cut -c 9-16|xargs kill -9 ps aux | grep -v grep | grep programname | awk '{print $2}' | xargs kill == 取出耗时最多的 == cat zip19.log |grep "耗时 "| sort -t " " -k 4 -n -r | head -n 10 find .
-name tmp.log|xargs ls -l|sort -t " " -k 8 -n -r == 修改downloadstation 最大下载数 == sudo cat /var/packages/DownloadStation/etc/settings.conf == 统计当前文件夹下所有json文件行数 == awk 'END{print NR}' *.json == 进阶:结合xargs,统计22年10月所有json文件行数 == ls -al | grep '202210'| awk '{print $9}' | xargs -I {} awk 'END{print NR}' {} | awk '{sum+=$1} END{print "sum="sum}' == 显示当前文件夹下size最大的前十个文件 == ls -lSh | head -n 10 du -h ./ | sort -hr | head -n 10 == 显示当前文件夹下时间最新的前十个文件 == ls -lt | head -n 10 #倒序 ls -rlt | head -n 10 == 统计当前文件夹下有几种文件类型 == find . -type f | sed -n 's/..*\.//p' | sort | uniq -c == 删除文件最后一行 == sed -i '$d' spider_log.jsonl == docker相关 == #查看历史docker docker ps -a #创建docker docker run -dit -v /mnt/disk1:/mnt/disk1 -v /mnt/disk2:/mnt/disk2 -v /mnt/disk3:/mnt/disk3 --name cpu huggingface/transformers-pytorch-cpu /bin/bash #进入docker docker exec -it cpu /bin/bash #关闭docker docker stop cpu #启动docker docker start cpu #删除所有已创建的docker容器 docker rm $(docker ps -aq) #修改root权限 sudo docker exec -u 0 -it notebook2 bash passwd jovyan apt-get update apt-get upgrade apt-get install vim visudo jovyan ALL=(ALL:ALL) ALL == 查看进程详情 == lsof -p 1234 == 拷贝子文件夹到另一个地方 == cp -R a/. b/ mv a/* b/ == 本文件夹下最大的文件 == find . -type f -exec du -a {} \; | sort -n -r | head -n 10 == 只出现在b文件中的行 == comm -13 a b > c == 远程同步文件夹 == rsync --progress --partial -avz -e "ssh -p 22566" xxx@xxx.org:/home/data/ ./data/